DataFrame

DataFrame module: columnar, lazy-evaluated tabular data.

Design: typed columns (IntCol/FloatCol/StrCol/BoolCol + nullable variants), a LazyFrame / Plan ADT for deferred execution, and a ColExpr language for vectorized filter predicates (no row materialization).

- Phase 1: types, construction, column/row access, structural ops
- Phase 2: CSV and JSON I/O
- Phase 3: ColExpr, LazyFrame, Plan interpreter
- Phase 4: GroupBy + aggregation (uses Stats module)
- Phase 5: inner_join, left_join, right_join, outer_join
- Phase 6: describe, value_counts, col_z_score, col_normalize

Types

type Value = ...

type Column = ...

type Row = Row(List((String, Value)))

type DataFrame = DataFrame(List(Column))

type ColumnBuilder = ...

type ColExpr = ...

type SortDir = Asc | Desc

type JoinKind = Inner | Left | Right | Outer

type Plan = ...

type LazyFrame = LazyFrame(Plan)

type CsvWriteOpts = CsvWriteOpts(String, String, Bool)

type AggExpr = ...

type GroupKey = GroupKey(List(Value))

type GroupedFrame = GroupedFrame(DataFrame, List(String))

type ColStats = ColStats(String, String, Int, Option(Float), Option(Float), ...

type WindowExpr = ...

Functions

fn make_df(cols : List(Column)) : DataFrame

Construct a DataFrame directly from a list of columns. No length check.

fn col_name(col : Column) : String

Returns the name of a column.

fn col_len(col : Column) : Int

Returns the length (number of rows) of a column.

fn col_value_at(col : Column, i : Int) : Value

fn col_to_value_list(col : Column) : List(Value)

fn filter_col_by_indices(col : Column, indices : List(Int)) : Column

fn filter_col_by_mask(col : Column, mask : TypedArray(Bool)) : Column

fn reorder_col(col : Column, indices : List(Int)) : Column

fn col_is_nullable(col : Column) : Bool

Returns true if the column is a nullable variant.

fn col_null_count(col : Column) : Int

Returns the number of null entries in a nullable column (0 for non-nullable).

fn col_to_nullable(col : Column) : Column

Wrap a non-nullable column in a nullable variant (all-false bitmap). No-op on already-nullable columns.

fn value_to_string(v : Value) : String

Converts a Value to its string representation.

fn value_type_str(v : Value) : String

fn values_equal(a : Value, b : Value) : Bool

fn compare_values(a : Value, b : Value) : Int

fn value_hash(v : Value) : Int

fn values_list_eq(as_ : List(Value), bs : List(Value)) : Bool

fn builder_name(b : ColumnBuilder) : String

fn builder_push_strict(b : ColumnBuilder, val : Value) : Result(ColumnBuilder, String)

fn builder_push_widen(b : ColumnBuilder, val : Value) : ColumnBuilder

fn builder_to_column(b : ColumnBuilder) : Column

fn make_builder(name : String, val : Value) : ColumnBuilder

fn empty() : DataFrame

Returns an empty DataFrame with no rows and no columns.

fn from_columns(cols : List(Column)) : Result(DataFrame, String)

Constructs a DataFrame from a list of pre-built columns. All columns must have equal length. Returns Err if they don't.

fn from_rows(rows : List(Row)) : Result(DataFrame, String)

Constructs a DataFrame from a list of Row values. Type inference is strict: if a column starts as Int, every subsequent value must also be IntVal. Errors on type mismatch or NullVal in Phase 1. For widening inference (Int→Float→String), use from_rows_widen.

fn from_rows_widen(rows : List(Row)) : Result(DataFrame, String)

fn schema(df : DataFrame) : List(String)

Returns the ordered list of column names.

fn row_count(df : DataFrame) : Int

Returns the number of rows. O(1) for single-column frames; O(cols) otherwise.

fn col_count(df : DataFrame) : Int

Returns the number of columns.

fn get_column(df : DataFrame, name : String) : Result(Column, String)

Returns the column with the given name, or Err if not found.

fn get_int_col(df : DataFrame, name : String) : Result(List(Int), String)

Returns the Int data list for a named column, or Err.

fn get_float_col(df : DataFrame, name : String) : Result(List(Float), String)

Returns the Float data list for a named column, or Err.

fn get_string_col(df : DataFrame, name : String) : Result(List(String), String)

Returns the String data list for a named column, or Err.

fn get_bool_col(df : DataFrame, name : String) : Result(List(Bool), String)

Returns the Bool data list for a named column, or Err.

fn float_list(df : DataFrame, name : String) : Result(List(Float), String)

Returns a column as List(Float) for use with Stats functions. Int columns are promoted.

fn get_row(df : DataFrame, i : Int) : Row

Materializes row i as a Row snapshot. O(col_count × i). Slow path.

fn to_rows(df : DataFrame) : List(Row)

Returns all rows as a list of Row values. O(rows × cols). Slow path.

fn row_get(row : Row, name : String) : Option(Value)

Look up a field in a Row by name. Returns None if not found.

fn row_get_int(row : Row, name : String) : Option(Int)

Look up an Int field in a Row.

fn row_get_float(row : Row, name : String) : Option(Float)

Look up a Float field in a Row. Int values are promoted to Float.

fn row_get_string(row : Row, name : String) : Option(String)

Look up a String field in a Row.

fn row_get_bool(row : Row, name : String) : Option(Bool)

Look up a Bool field in a Row.

fn add_column(df : DataFrame, col : Column) : Result(DataFrame, String)

Adds a column to a DataFrame. Errors if name already exists or length mismatches.

fn drop_column(df : DataFrame, name : String) : DataFrame

Removes a column by name. No-op if not found.

fn rename_column(df : DataFrame, old_name : String, new_name : String) : Result(DataFrame, String)

Renames a column. Errors if old_name not found or new_name already exists.

fn rename_col(col : Column, new_name : String) : Column

fn select_columns(df : DataFrame, names : List(String)) : Result(DataFrame, String)

Returns a DataFrame with columns reordered/selected by name list. Errors on missing names.

fn head(df : DataFrame, n : Int) : DataFrame

Returns the first n rows.

fn tail(df : DataFrame, n : Int) : DataFrame

Returns the last n rows.

fn slice(df : DataFrame, start : Int, len : Int) : DataFrame

Returns rows [start, start+len). Clamps to available rows.

fn slice_col(col : Column, start : Int, len : Int) : Column

fn drop_rows(df : DataFrame, n : Int) : DataFrame

fn lazy(df : DataFrame) : LazyFrame

Wraps a materialized DataFrame in a lazy frame.

fn filter(lf : LazyFrame, expr : ColExpr) : LazyFrame

Appends a Filter node that uses a vectorized ColExpr predicate.

fn select(lf : LazyFrame, cols : List(String)) : LazyFrame

Projects a subset of columns.

fn with_column(lf : LazyFrame, name : String, f : Row -> Value) : LazyFrame

Adds or replaces a column computed row-by-row using a March closure.

fn sort_by(lf : LazyFrame, keys : List((String, SortDir))) : LazyFrame

Appends a sort node. Keys is a list of (column_name, SortDir) pairs.

fn limit(lf : LazyFrame, n : Int) : LazyFrame

Limits the result to the first n rows.

fn offset(lf : LazyFrame, n : Int) : LazyFrame

Skips the first n rows.

fn rename(lf : LazyFrame, old_name : String, new_name : String) : LazyFrame

Renames a column in the pipeline.

fn drop(lf : LazyFrame, cols : List(String)) : LazyFrame

Drops columns by name.

fn collect(lf : LazyFrame) : Result(DataFrame, String)

Materializes the lazy plan into a DataFrame.

fn inner_join(lf : LazyFrame, right : DataFrame, keys : List(String)) : LazyFrame

Only rows where every key column matches in both left and right are included in the output. Right key columns are not duplicated.

Example:
    let lf = DataFrame.lazy(orders_df) |> DataFrame.inner_join(products_df, ["product_id"])
    let result = DataFrame.collect(lf)
    -- Only orders that reference a known product_id are included

fn left_join(lf : LazyFrame, right : DataFrame, keys : List(String)) : LazyFrame

Rows in the left frame that have no match in the right frame are still included, with all right-only columns set to Null (NullableXxxCol). Right columns that are join keys are not duplicated.

Example:
    let lf = DataFrame.lazy(orders_df) |> DataFrame.left_join(customers_df, ["customer_id"])
    let result = DataFrame.collect(lf)
    -- Every order row is present; customer_name is Null for unknown customers

fn right_join(lf : LazyFrame, right : DataFrame, keys : List(String)) : LazyFrame

The mirror image of left_join. Every row in right appears in the output; rows with no match in the left frame get Null for every left-only column.

Example:
    let lf = DataFrame.lazy(transactions_df) |> DataFrame.right_join(reference_df, ["code"])
    let result = DataFrame.collect(lf)
    -- Every reference row is included; transaction cols are Null for unmatched codes

fn outer_join(lf : LazyFrame, right : DataFrame, keys : List(String)) : LazyFrame

The union of left_join and right_join: every row from both the left and right frames appears in the output. Rows with no match on the other side get Null for all columns from that side.

Example:
    let lf = DataFrame.lazy(employees_df) |> DataFrame.outer_join(departments_df, ["dept_id"])
    let result = DataFrame.collect(lf)
    -- All employees and all departments appear; Nulls where there is no match

fn eval_plan(plan : Plan) : Result(DataFrame, String)

fn eval_col_expr(df : DataFrame, expr : ColExpr) : Result(Column, String)

fn apply_filter(df : DataFrame, expr : ColExpr) : Result(DataFrame, String)

fn apply_with_column(df : DataFrame, name : String, f : Row -> Value) : Result(DataFrame, String)

fn values_to_column(name : String, values : List(Value)) : Result(Column, String)

fn apply_sort(df : DataFrame, keys : List((String, SortDir))) : Result(DataFrame, String)

fn compare_rows_by_keys(df : DataFrame, i : Int, j : Int, keys : List((String, SortDir))) : Int

fn col_value_at_by_name(df : DataFrame, name : String, i : Int) : Value

fn default_csv_opts() : CsvWriteOpts

Default CSV write options: comma as the delimiter, double-quote as the quote character, and a header row included.

fn split_csv_line(line : String, delim : String) : List(String)

fn from_csv_string(s : String) : Result(DataFrame, String)

Parse a CSV string into a DataFrame. Infers column types with widening.

fn parse_cell(s : String) : Value

fn to_csv_string(df : DataFrame) : String

Serialize a DataFrame to a CSV string. Uses default options.

fn to_csv_string_opts(df : DataFrame, opts : CsvWriteOpts) : String

Serialize a DataFrame to a CSV string with custom options.

fn data_rows_to_strings(df : DataFrame, delim : String, quote : String) : List(String)

fn csv_row_to_string(cells : List(String), delim : String, quote : String) : String

fn csv_quote_cell(cell : String, delim : String, quote : String) : String

fn from_json_string(s : String) : Result(DataFrame, String)

Parse a JSON string (array of objects) into a DataFrame.

fn json_array_to_df(items : List(JsonValue)) : Result(DataFrame, String)

fn json_value_to_value(jv : JsonValue) : Value

fn to_json_string(df : DataFrame) : String

Serialize a DataFrame to a JSON string (array of objects).

fn value_to_json(v : Value) : JsonValue

fn group_key_hash(gk : GroupKey) : Int

fn group_key_eq(a : GroupKey, b : GroupKey) : Bool

fn group_by(df : DataFrame, group_cols : List(String)) : GroupedFrame

Groups a DataFrame by the specified columns.

fn agg(gf : GroupedFrame, exprs : List(AggExpr)) : Result(DataFrame, String)

Aggregates a GroupedFrame using the given expressions. Returns a new DataFrame.

fn make_group_key(df : DataFrame, row_idx : Int, group_cols : List(String)) : GroupKey

fn agg_output_name(expr : AggExpr, existing_names : List(String)) : String

fn agg_base_name(expr : AggExpr) : String

fn agg_type_suffix(expr : AggExpr) : String

fn apply_group_by(df : DataFrame, group_cols : List(String), agg_exprs : List(AggExpr)) : Result(DataFrame, String)

fn filter_col_by_indices_df(df : DataFrame, indices : List(Int)) : DataFrame

fn eval_agg_exprs(sub_df : DataFrame, exprs : List(AggExpr), group_cols : List(String)) : List((String, Value))

fn eval_agg(sub_df : DataFrame, expr : AggExpr, _out_name : String) : Value

fn value_counts(df : DataFrame, col : String) : Result(DataFrame, String)

Count frequency of each distinct value. Returns DataFrame with columns [col_name, 'count'] sorted by count desc.

fn apply_join(left : DataFrame, right : DataFrame, on_cols : List(String), kind : JoinKind) : Result(DataFrame, String)

fn apply_left_join(left : DataFrame, right : DataFrame, on_cols : List(String)) : Result(DataFrame, String)

fn apply_right_join(left : DataFrame, right : DataFrame, on_cols : List(String)) : Result(DataFrame, String)

fn apply_outer_join(left : DataFrame, right : DataFrame, on_cols : List(String)) : Result(DataFrame, String)

fn apply_inner_join(left : DataFrame, right : DataFrame, on_cols : List(String)) : Result(DataFrame, String)

fn col_describe(df : DataFrame) : List(ColStats)

Returns summary statistics for each column.

fn col_describe_column(col : Column) : ColStats

fn col_z_score(col : Column) : Result(Column, String)

Z-score normalize an IntCol or FloatCol. Returns FloatCol.

fn col_normalize(col : Column) : Result(Column, String)

Min-max normalize a column to [0, 1]. Returns FloatCol.

fn summarize(df : DataFrame) : DataFrame

Returns a DataFrame of summary statistics with one row per input column. Columns in the result: "column", "type", "count", "mean", "std", "min", "p25", "median", "p75", "max". Non-numeric columns get NullVal for every numeric stat.

Example:
    let df = DataFrame.make_df([IntCol("x", [1,2,3,4,5])])
    let d = DataFrame.summarize(df)
    -- d has 1 row: column="x", type="Int", count=5, mean=3.0, ...

fn sample(df : DataFrame, n : Int) : DataFrame

Selects n rows at evenly-spaced positions across the DataFrame. Returns the full DataFrame unchanged when n >= row_count(df).

Example:
    let df = DataFrame.make_df([IntCol("v", [0,1,2,3,4,5,6,7,8,9])])
    let s = DataFrame.sample(df, 3)
    -- picks rows 0, 3, 6 -> IntCol("v", [0, 3, 6])

fn train_test_split(df : DataFrame, ratio : Float) : (DataFrame, DataFrame)

ratio is the fraction of rows placed in the training set (0.0 < ratio < 1.0). The first floor(row_count * ratio) rows become the training set; the remainder become the test set. Row order is preserved.

Example:
    let (train, test) = DataFrame.train_test_split(df, 0.8)
    -- 80% of rows → train, 20% → test

fn col_add_float(col : Column, f : Float) : Result(Column, String)

Adds the scalar f to every element of a numeric column. Returns a FloatCol. Errors on non-numeric column types.

Example:
    let c = FloatCol("price", [1.0, 2.0, 3.0])
    let c2 = DataFrame.col_add_float(c, 10.0)
    -- Ok(FloatCol("price", [11.0, 12.0, 13.0]))

fn col_mul_float(col : Column, f : Float) : Result(Column, String)

Multiplies every element of a numeric column by the scalar f. Returns a FloatCol. Errors on non-numeric column types.

Example:
    let c = IntCol("qty", [1, 2, 3])
    let c2 = DataFrame.col_mul_float(c, 2.5)
    -- Ok(FloatCol("qty", [2.5, 5.0, 7.5]))

fn col_add_col(col1 : Column, col2 : Column) : Result(Column, String)

Adds two numeric columns element-wise. The result type depends on the input types:

- Int + Int → IntCol (named after col1)
- Float + Float, Int + Float, Float + Int → FloatCol (named after col1)

Errors if the columns have different lengths or are non-numeric.

Example:
    let a = IntCol("a", [1, 2, 3])
    let b = IntCol("b", [10, 20, 30])
    let sum = DataFrame.col_add_col(a, b)
    -- Ok(IntCol("a", [11, 22, 33]))

fn col_has_null_at(col : Column, i : Int) : Bool

fn drop_nulls(df : DataFrame) : DataFrame

Remove rows that have a null in any column.

fn drop_nulls_in(df : DataFrame, names : List(String)) : DataFrame

Remove rows that have a null in any of the specified columns.

fn fill_null(col : Column, fill_val : Value) : Result(Column, String)

Replace null values in a nullable column with fill_val. Errors on type mismatch.

fn fill_null_df(df : DataFrame, col_nm : String, fill_val : Value) : Result(DataFrame, String)

Apply fill_null to a named column of a DataFrame and return the DataFrame with that column replaced by the filled result.

fn fill_null_forward(col : Column) : Column

Forward-fill nulls in a nullable column: propagate the last non-null value downward.

fn fill_null_backward(col : Column) : Column

Backward-fill nulls in a nullable column: propagate the next non-null value upward.

fn sort_pairs_asc(pairs : List((Int, Int))) : List((Int, Int))

fn window(df : DataFrame, expr : WindowExpr, out_col : String) : Result(DataFrame, String)

Evaluates the window expression over the DataFrame and adds the result as a new column named out_col. No partitioning (operates over all rows).

fn melt(df : DataFrame, id_vars : List(String), value_vars : List(String), ...

Reshapes a DataFrame from wide to long format.

- id_vars — columns kept as-is in every output row.
- value_vars — columns whose names become values of var_col and whose values become values of val_col.

One output row is produced per (input row × value_var).

fn pivot(df : DataFrame, index_col : String, cols_col : String, ...

Reshapes a DataFrame from long to wide format.

- index_col — column whose distinct values become output rows.
- cols_col — column whose distinct values become new output column names.
- vals_col — column whose values fill the output cells.

Missing (index, col) combinations get NullVal.

fn to_html(df : DataFrame) : String

Render a DataFrame as an HTML table string. The output is detected automatically by the March notebook and displayed as a styled table rather than raw text.

fn print_table(df : DataFrame) : Unit

Print a DataFrame as a simple text table.