From cfa19caabfc014c40275e3254170789b12435a3c Mon Sep 17 00:00:00 2001 From: Greg Felice Date: Thu, 2 Jul 2026 09:26:47 -0400 Subject: [PATCH 1/2] Fix segfault and out-of-bounds reads in file loaders on malformed rows load_edges_from_file() and load_labels_from_file() build their COPY parser with only format=csv and header=false, so COPY uses its default comma delimiter. A file delimited by anything else (or a malformed row) then parses with an unexpected column count, and the loaders indexed the parsed fields without validating that count: - process_edge_row() reads the four fixed fields fields[0..3] unconditionally. A non-comma-delimited edge file parses as a single column, so fields[1..3] are out of bounds -> segfault (issue #2449). - create_agtype_from_list()/_i() pair header[i] with fields[i] for all i < nfields, so a row with more fields than the header reads header[i] out of bounds. Add bounds validation that turns these into clear errors: - Edge header must have >= 4 columns; a smaller count almost always means the wrong delimiter, so the error carries a hint. - Each edge row must have >= 4 columns and no more than the header's. - Each label row must have no more than the header's column count. Rows with fewer trailing columns than the header remain allowed, matching existing behavior (exercised by the existing conversion tests). This closes the segfault and out-of-bounds reads. The silent mis-parsing of a non-comma file whose header and rows share the same (wrong) column count is not detectable here; adding a delimiter option to the load functions is a separate follow-up. Adds a regression test in age_load using a pipe-delimited edge file. Addresses #2449. --- regress/age_load/data/bad_delim_edges.csv | 2 ++ regress/expected/age_load.out | 43 +++++++++++++++++++++++ regress/sql/age_load.sql | 15 ++++++++ src/backend/utils/load/ag_load_edges.c | 39 ++++++++++++++++++++ src/backend/utils/load/ag_load_labels.c | 17 +++++++++ 5 files changed, 116 insertions(+) create mode 100644 regress/age_load/data/bad_delim_edges.csv diff --git a/regress/age_load/data/bad_delim_edges.csv b/regress/age_load/data/bad_delim_edges.csv new file mode 100644 index 000000000..c72170a6a --- /dev/null +++ b/regress/age_load/data/bad_delim_edges.csv @@ -0,0 +1,2 @@ +start_id|start_vertex_type|end_id|end_vertex_type +1|V|2|V diff --git a/regress/expected/age_load.out b/regress/expected/age_load.out index 1f76c31ce..b4dd0492b 100644 --- a/regress/expected/age_load.out +++ b/regress/expected/age_load.out @@ -454,6 +454,49 @@ NOTICE: graph "agload_conversion" has been dropped (1 row) +-- +-- Issue 2449: mis-delimited / malformed load files must fail with a clear +-- error instead of segfaulting or silently corrupting data. Edge files +-- require the 4 fixed columns; a file that is not comma-delimited parses as +-- a single column, so this must be rejected at the header. +-- +SELECT create_graph('agload_delim'); +NOTICE: graph "agload_delim" has been created + create_graph +-------------- + +(1 row) + +SELECT create_vlabel('agload_delim', 'V'); +NOTICE: VLabel "V" has been created + create_vlabel +--------------- + +(1 row) + +SELECT create_elabel('agload_delim', 'E'); +NOTICE: ELabel "E" has been created + create_elabel +--------------- + +(1 row) + +-- pipe-delimited edge file -> parses to 1 column -> clean error (was a segfault) +SELECT load_edges_from_file('agload_delim', 'E', 'age_load/bad_delim_edges.csv'); +ERROR: edge file must have at least 4 columns (start_id, start_vertex_type, end_id, end_vertex_type), but the header has 1 +HINT: load_edges_from_file expects a comma-delimited CSV; check the file's delimiter. +SELECT drop_graph('agload_delim', true); +NOTICE: drop cascades to 4 other objects +DETAIL: drop cascades to table agload_delim._ag_label_vertex +drop cascades to table agload_delim._ag_label_edge +drop cascades to table agload_delim."V" +drop cascades to table agload_delim."E" +NOTICE: graph "agload_delim" has been dropped + drop_graph +------------ + +(1 row) + -- -- Test security and permissions -- diff --git a/regress/sql/age_load.sql b/regress/sql/age_load.sql index 976f050af..5f8ef92eb 100644 --- a/regress/sql/age_load.sql +++ b/regress/sql/age_load.sql @@ -194,6 +194,21 @@ SELECT load_edges_from_file('agload_conversion', 'Edges1', '../../etc/passwd', t -- SELECT drop_graph('agload_conversion', true); +-- +-- Issue 2449: mis-delimited / malformed load files must fail with a clear +-- error instead of segfaulting or silently corrupting data. Edge files +-- require the 4 fixed columns; a file that is not comma-delimited parses as +-- a single column, so this must be rejected at the header. +-- +SELECT create_graph('agload_delim'); +SELECT create_vlabel('agload_delim', 'V'); +SELECT create_elabel('agload_delim', 'E'); + +-- pipe-delimited edge file -> parses to 1 column -> clean error (was a segfault) +SELECT load_edges_from_file('agload_delim', 'E', 'age_load/bad_delim_edges.csv'); + +SELECT drop_graph('agload_delim', true); + -- -- Test security and permissions -- diff --git a/src/backend/utils/load/ag_load_edges.c b/src/backend/utils/load/ag_load_edges.c index c05bf3352..51dc9e3e1 100644 --- a/src/backend/utils/load/ag_load_edges.c +++ b/src/backend/utils/load/ag_load_edges.c @@ -56,6 +56,24 @@ static void process_edge_row(char **fields, int nfields, char *end_vertex_type; agtype *edge_properties; + /* + * Guard the fixed fields[0..3] accesses below and the header[i]/fields[i] + * pairing in create_agtype_from_list_i() against out-of-bounds reads on + * malformed or mis-delimited rows. A row must have at least the 4 fixed + * columns and no more columns than the header (rows with fewer trailing + * property columns than the header are allowed, matching existing + * behavior). A single-column row from a non-comma-delimited file is + * rejected here (previously it segfaulted). + */ + if (nfields < 4 || nfields > header_count) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("edge file row has %d columns; expected at least 4 " + "and no more than the header's %d", + nfields, header_count))); + } + /* Generate edge ID */ entry_id = nextval_internal(label_seq_relid, true); edge_id = make_graphid(label_id, entry_id); @@ -219,6 +237,27 @@ int create_edges_from_csv_file(char *file_path, header[i] = trim_whitespace(fields[i]); } + /* + * Edge files require the four fixed columns start_id, + * start_vertex_type, end_id and end_vertex_type. A smaller + * count almost always means the file is not comma-delimited + * (COPY defaults to comma). Fail clearly here instead of + * reading past the parsed fields in process_edge_row(), which + * previously caused a segfault. + */ + if (header_count < 4) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("edge file must have at least 4 columns " + "(start_id, start_vertex_type, end_id, " + "end_vertex_type), but the header has %d", + header_count), + errhint("load_edges_from_file expects a " + "comma-delimited CSV; check the file's " + "delimiter."))); + } + is_first_row = false; } else diff --git a/src/backend/utils/load/ag_load_labels.c b/src/backend/utils/load/ag_load_labels.c index 5b11f68b8..9698c784e 100644 --- a/src/backend/utils/load/ag_load_labels.c +++ b/src/backend/utils/load/ag_load_labels.c @@ -46,6 +46,23 @@ static void process_vertex_row(char **fields, int nfields, TupleTableSlot *slot; agtype *vertex_properties; + /* + * Guard the header[i]/fields[i] pairing in create_agtype_from_list() + * against out-of-bounds reads on malformed rows that have more fields + * than the header. Rows with fewer fields than the header are allowed + * (matching existing behavior). Note: a file delimited by something + * other than comma is parsed as a single column throughout, so header + * and rows still match and the data lands in properties verbatim -- + * specifying the delimiter is the separate fix for that. + */ + if (nfields > header_count) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("label file row has %d columns, more than the " + "header's %d", nfields, header_count))); + } + /* Generate or use provided entry_id */ if (id_field_exists) { From a431fc55a8d055317bba02bcbe8ccc881419521a Mon Sep 17 00:00:00 2001 From: Greg Felice Date: Thu, 2 Jul 2026 12:44:58 -0400 Subject: [PATCH 2/2] loader guards: clarify error wording and add per-row regression coverage Address review feedback on the nfields guards: - Error messages now say "the header's %d columns" (was "the header's %d"), making the count's unit explicit. - Add regression cases exercising the per-row guards, which previously only had coverage for the mis-delimited-header path: * an edge row with fewer than 4 columns * an edge row with more columns than the header * a label row with more columns than the header Each asserts a clean ERROR (these were the out-of-bounds reads the guards now catch). --- regress/age_load/data/edges_long_row.csv | 2 ++ regress/age_load/data/edges_short_row.csv | 2 ++ regress/age_load/data/labels_long_row.csv | 2 ++ regress/expected/age_load.out | 16 +++++++++++++++- regress/sql/age_load.sql | 16 +++++++++++++++- src/backend/utils/load/ag_load_edges.c | 2 +- src/backend/utils/load/ag_load_labels.c | 2 +- 7 files changed, 38 insertions(+), 4 deletions(-) create mode 100644 regress/age_load/data/edges_long_row.csv create mode 100644 regress/age_load/data/edges_short_row.csv create mode 100644 regress/age_load/data/labels_long_row.csv diff --git a/regress/age_load/data/edges_long_row.csv b/regress/age_load/data/edges_long_row.csv new file mode 100644 index 000000000..2036f534a --- /dev/null +++ b/regress/age_load/data/edges_long_row.csv @@ -0,0 +1,2 @@ +start_id,start_vertex_type,end_id,end_vertex_type +1,V,2,V,extra diff --git a/regress/age_load/data/edges_short_row.csv b/regress/age_load/data/edges_short_row.csv new file mode 100644 index 000000000..e307927b3 --- /dev/null +++ b/regress/age_load/data/edges_short_row.csv @@ -0,0 +1,2 @@ +start_id,start_vertex_type,end_id,end_vertex_type +1,V diff --git a/regress/age_load/data/labels_long_row.csv b/regress/age_load/data/labels_long_row.csv new file mode 100644 index 000000000..72ec2a305 --- /dev/null +++ b/regress/age_load/data/labels_long_row.csv @@ -0,0 +1,2 @@ +id,name +1,Alice,extra diff --git a/regress/expected/age_load.out b/regress/expected/age_load.out index b4dd0492b..17c5ecc27 100644 --- a/regress/expected/age_load.out +++ b/regress/expected/age_load.out @@ -481,10 +481,24 @@ NOTICE: ELabel "E" has been created (1 row) --- pipe-delimited edge file -> parses to 1 column -> clean error (was a segfault) +-- pipe-delimited edge file -> parses to 1 column -> clean error at the header +-- (was a segfault) SELECT load_edges_from_file('agload_delim', 'E', 'age_load/bad_delim_edges.csv'); ERROR: edge file must have at least 4 columns (start_id, start_vertex_type, end_id, end_vertex_type), but the header has 1 HINT: load_edges_from_file expects a comma-delimited CSV; check the file's delimiter. +-- per-row guards (header is valid, but an individual data row is ragged): +-- an edge row with fewer than 4 columns -> clean error (was an OOB read of +-- the fixed fields[1..3]) +SELECT load_edges_from_file('agload_delim', 'E', 'age_load/edges_short_row.csv'); +ERROR: edge file row has 2 columns; expected at least 4 and no more than the header's 4 columns +-- an edge row with more columns than the header -> clean error (was an OOB +-- read of header[i] in create_agtype_from_list_i) +SELECT load_edges_from_file('agload_delim', 'E', 'age_load/edges_long_row.csv'); +ERROR: edge file row has 5 columns; expected at least 4 and no more than the header's 4 columns +-- a label row with more columns than the header -> clean error (was an OOB +-- read of header[i] in create_agtype_from_list) +SELECT load_labels_from_file('agload_delim', 'V', 'age_load/labels_long_row.csv'); +ERROR: label file row has 3 columns, more than the header's 2 columns SELECT drop_graph('agload_delim', true); NOTICE: drop cascades to 4 other objects DETAIL: drop cascades to table agload_delim._ag_label_vertex diff --git a/regress/sql/age_load.sql b/regress/sql/age_load.sql index 5f8ef92eb..196b09806 100644 --- a/regress/sql/age_load.sql +++ b/regress/sql/age_load.sql @@ -204,9 +204,23 @@ SELECT create_graph('agload_delim'); SELECT create_vlabel('agload_delim', 'V'); SELECT create_elabel('agload_delim', 'E'); --- pipe-delimited edge file -> parses to 1 column -> clean error (was a segfault) +-- pipe-delimited edge file -> parses to 1 column -> clean error at the header +-- (was a segfault) SELECT load_edges_from_file('agload_delim', 'E', 'age_load/bad_delim_edges.csv'); +-- per-row guards (header is valid, but an individual data row is ragged): +-- an edge row with fewer than 4 columns -> clean error (was an OOB read of +-- the fixed fields[1..3]) +SELECT load_edges_from_file('agload_delim', 'E', 'age_load/edges_short_row.csv'); + +-- an edge row with more columns than the header -> clean error (was an OOB +-- read of header[i] in create_agtype_from_list_i) +SELECT load_edges_from_file('agload_delim', 'E', 'age_load/edges_long_row.csv'); + +-- a label row with more columns than the header -> clean error (was an OOB +-- read of header[i] in create_agtype_from_list) +SELECT load_labels_from_file('agload_delim', 'V', 'age_load/labels_long_row.csv'); + SELECT drop_graph('agload_delim', true); -- diff --git a/src/backend/utils/load/ag_load_edges.c b/src/backend/utils/load/ag_load_edges.c index 51dc9e3e1..01585bab0 100644 --- a/src/backend/utils/load/ag_load_edges.c +++ b/src/backend/utils/load/ag_load_edges.c @@ -70,7 +70,7 @@ static void process_edge_row(char **fields, int nfields, ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("edge file row has %d columns; expected at least 4 " - "and no more than the header's %d", + "and no more than the header's %d columns", nfields, header_count))); } diff --git a/src/backend/utils/load/ag_load_labels.c b/src/backend/utils/load/ag_load_labels.c index 9698c784e..236d47a1d 100644 --- a/src/backend/utils/load/ag_load_labels.c +++ b/src/backend/utils/load/ag_load_labels.c @@ -60,7 +60,7 @@ static void process_vertex_row(char **fields, int nfields, ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("label file row has %d columns, more than the " - "header's %d", nfields, header_count))); + "header's %d columns", nfields, header_count))); } /* Generate or use provided entry_id */