From cfa19caabfc014c40275e3254170789b12435a3c Mon Sep 17 00:00:00 2001
From: Greg Felice <gregfelice@gmail.com>
Date: Thu, 2 Jul 2026 09:26:47 -0400
Subject: [PATCH 1/2] Fix segfault and out-of-bounds reads in file loaders on
 malformed rows

load_edges_from_file() and load_labels_from_file() build their COPY
parser with only format=csv and header=false, so COPY uses its default
comma delimiter. A file delimited by anything else (or a malformed row)
then parses with an unexpected column count, and the loaders indexed the
parsed fields without validating that count:

- process_edge_row() reads the four fixed fields fields[0..3]
  unconditionally. A non-comma-delimited edge file parses as a single
  column, so fields[1..3] are out of bounds -> segfault (issue #2449).
- create_agtype_from_list()/_i() pair header[i] with fields[i] for all
  i < nfields, so a row with more fields than the header reads header[i]
  out of bounds.

Add bounds validation that turns these into clear errors:

- Edge header must have >= 4 columns; a smaller count almost always
  means the wrong delimiter, so the error carries a hint.
- Each edge row must have >= 4 columns and no more than the header's.
- Each label row must have no more than the header's column count.

Rows with fewer trailing columns than the header remain allowed, matching
existing behavior (exercised by the existing conversion tests).

This closes the segfault and out-of-bounds reads. The silent mis-parsing
of a non-comma file whose header and rows share the same (wrong) column
count is not detectable here; adding a delimiter option to the load
functions is a separate follow-up.

Adds a regression test in age_load using a pipe-delimited edge file.

Addresses #2449.
---
 regress/age_load/data/bad_delim_edges.csv |  2 ++
 regress/expected/age_load.out             | 43 +++++++++++++++++++++++
 regress/sql/age_load.sql                  | 15 ++++++++
 src/backend/utils/load/ag_load_edges.c    | 39 ++++++++++++++++++++
 src/backend/utils/load/ag_load_labels.c   | 17 +++++++++
 5 files changed, 116 insertions(+)
 create mode 100644 regress/age_load/data/bad_delim_edges.csv

diff --git a/regress/age_load/data/bad_delim_edges.csv b/regress/age_load/data/bad_delim_edges.csv
new file mode 100644
index 000000000..c72170a6a
--- /dev/null
+++ b/regress/age_load/data/bad_delim_edges.csv
@@ -0,0 +1,2 @@
+start_id|start_vertex_type|end_id|end_vertex_type
+1|V|2|V
diff --git a/regress/expected/age_load.out b/regress/expected/age_load.out
index 1f76c31ce..b4dd0492b 100644
--- a/regress/expected/age_load.out
+++ b/regress/expected/age_load.out
@@ -454,6 +454,49 @@ NOTICE:  graph "agload_conversion" has been dropped
  
 (1 row)
 
+--
+-- Issue 2449: mis-delimited / malformed load files must fail with a clear
+-- error instead of segfaulting or silently corrupting data. Edge files
+-- require the 4 fixed columns; a file that is not comma-delimited parses as
+-- a single column, so this must be rejected at the header.
+--
+SELECT create_graph('agload_delim');
+NOTICE:  graph "agload_delim" has been created
+ create_graph 
+--------------
+ 
+(1 row)
+
+SELECT create_vlabel('agload_delim', 'V');
+NOTICE:  VLabel "V" has been created
+ create_vlabel 
+---------------
+ 
+(1 row)
+
+SELECT create_elabel('agload_delim', 'E');
+NOTICE:  ELabel "E" has been created
+ create_elabel 
+---------------
+ 
+(1 row)
+
+-- pipe-delimited edge file -> parses to 1 column -> clean error (was a segfault)
+SELECT load_edges_from_file('agload_delim', 'E', 'age_load/bad_delim_edges.csv');
+ERROR:  edge file must have at least 4 columns (start_id, start_vertex_type, end_id, end_vertex_type), but the header has 1
+HINT:  load_edges_from_file expects a comma-delimited CSV; check the file's delimiter.
+SELECT drop_graph('agload_delim', true);
+NOTICE:  drop cascades to 4 other objects
+DETAIL:  drop cascades to table agload_delim._ag_label_vertex
+drop cascades to table agload_delim._ag_label_edge
+drop cascades to table agload_delim."V"
+drop cascades to table agload_delim."E"
+NOTICE:  graph "agload_delim" has been dropped
+ drop_graph 
+------------
+ 
+(1 row)
+
 --
 -- Test security and permissions
 --
diff --git a/regress/sql/age_load.sql b/regress/sql/age_load.sql
index 976f050af..5f8ef92eb 100644
--- a/regress/sql/age_load.sql
+++ b/regress/sql/age_load.sql
@@ -194,6 +194,21 @@ SELECT load_edges_from_file('agload_conversion', 'Edges1', '../../etc/passwd', t
 --
 SELECT drop_graph('agload_conversion', true);
 
+--
+-- Issue 2449: mis-delimited / malformed load files must fail with a clear
+-- error instead of segfaulting or silently corrupting data. Edge files
+-- require the 4 fixed columns; a file that is not comma-delimited parses as
+-- a single column, so this must be rejected at the header.
+--
+SELECT create_graph('agload_delim');
+SELECT create_vlabel('agload_delim', 'V');
+SELECT create_elabel('agload_delim', 'E');
+
+-- pipe-delimited edge file -> parses to 1 column -> clean error (was a segfault)
+SELECT load_edges_from_file('agload_delim', 'E', 'age_load/bad_delim_edges.csv');
+
+SELECT drop_graph('agload_delim', true);
+
 --
 -- Test security and permissions
 --
diff --git a/src/backend/utils/load/ag_load_edges.c b/src/backend/utils/load/ag_load_edges.c
index c05bf3352..51dc9e3e1 100644
--- a/src/backend/utils/load/ag_load_edges.c
+++ b/src/backend/utils/load/ag_load_edges.c
@@ -56,6 +56,24 @@ static void process_edge_row(char **fields, int nfields,
     char *end_vertex_type;
     agtype *edge_properties;
 
+    /*
+     * Guard the fixed fields[0..3] accesses below and the header[i]/fields[i]
+     * pairing in create_agtype_from_list_i() against out-of-bounds reads on
+     * malformed or mis-delimited rows. A row must have at least the 4 fixed
+     * columns and no more columns than the header (rows with fewer trailing
+     * property columns than the header are allowed, matching existing
+     * behavior). A single-column row from a non-comma-delimited file is
+     * rejected here (previously it segfaulted).
+     */
+    if (nfields < 4 || nfields > header_count)
+    {
+        ereport(ERROR,
+                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                 errmsg("edge file row has %d columns; expected at least 4 "
+                        "and no more than the header's %d",
+                        nfields, header_count)));
+    }
+
     /* Generate edge ID */
     entry_id = nextval_internal(label_seq_relid, true);
     edge_id = make_graphid(label_id, entry_id);
@@ -219,6 +237,27 @@ int create_edges_from_csv_file(char *file_path,
                     header[i] = trim_whitespace(fields[i]);
                 }
 
+                /*
+                 * Edge files require the four fixed columns start_id,
+                 * start_vertex_type, end_id and end_vertex_type. A smaller
+                 * count almost always means the file is not comma-delimited
+                 * (COPY defaults to comma). Fail clearly here instead of
+                 * reading past the parsed fields in process_edge_row(), which
+                 * previously caused a segfault.
+                 */
+                if (header_count < 4)
+                {
+                    ereport(ERROR,
+                            (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                             errmsg("edge file must have at least 4 columns "
+                                    "(start_id, start_vertex_type, end_id, "
+                                    "end_vertex_type), but the header has %d",
+                                    header_count),
+                             errhint("load_edges_from_file expects a "
+                                     "comma-delimited CSV; check the file's "
+                                     "delimiter.")));
+                }
+
                 is_first_row = false;
             }
             else
diff --git a/src/backend/utils/load/ag_load_labels.c b/src/backend/utils/load/ag_load_labels.c
index 5b11f68b8..9698c784e 100644
--- a/src/backend/utils/load/ag_load_labels.c
+++ b/src/backend/utils/load/ag_load_labels.c
@@ -46,6 +46,23 @@ static void process_vertex_row(char **fields, int nfields,
     TupleTableSlot *slot;
     agtype *vertex_properties;
 
+    /*
+     * Guard the header[i]/fields[i] pairing in create_agtype_from_list()
+     * against out-of-bounds reads on malformed rows that have more fields
+     * than the header. Rows with fewer fields than the header are allowed
+     * (matching existing behavior). Note: a file delimited by something
+     * other than comma is parsed as a single column throughout, so header
+     * and rows still match and the data lands in properties verbatim --
+     * specifying the delimiter is the separate fix for that.
+     */
+    if (nfields > header_count)
+    {
+        ereport(ERROR,
+                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                 errmsg("label file row has %d columns, more than the "
+                        "header's %d", nfields, header_count)));
+    }
+
     /* Generate or use provided entry_id */
     if (id_field_exists)
     {

From a431fc55a8d055317bba02bcbe8ccc881419521a Mon Sep 17 00:00:00 2001
From: Greg Felice <gregfelice@gmail.com>
Date: Thu, 2 Jul 2026 12:44:58 -0400
Subject: [PATCH 2/2] loader guards: clarify error wording and add per-row
 regression coverage

Address review feedback on the nfields guards:

- Error messages now say "the header's %d columns" (was "the header's %d"),
  making the count's unit explicit.
- Add regression cases exercising the per-row guards, which previously only
  had coverage for the mis-delimited-header path:
    * an edge row with fewer than 4 columns
    * an edge row with more columns than the header
    * a label row with more columns than the header
  Each asserts a clean ERROR (these were the out-of-bounds reads the guards
  now catch).
---
 regress/age_load/data/edges_long_row.csv  |  2 ++
 regress/age_load/data/edges_short_row.csv |  2 ++
 regress/age_load/data/labels_long_row.csv |  2 ++
 regress/expected/age_load.out             | 16 +++++++++++++++-
 regress/sql/age_load.sql                  | 16 +++++++++++++++-
 src/backend/utils/load/ag_load_edges.c    |  2 +-
 src/backend/utils/load/ag_load_labels.c   |  2 +-
 7 files changed, 38 insertions(+), 4 deletions(-)
 create mode 100644 regress/age_load/data/edges_long_row.csv
 create mode 100644 regress/age_load/data/edges_short_row.csv
 create mode 100644 regress/age_load/data/labels_long_row.csv

diff --git a/regress/age_load/data/edges_long_row.csv b/regress/age_load/data/edges_long_row.csv
new file mode 100644
index 000000000..2036f534a
--- /dev/null
+++ b/regress/age_load/data/edges_long_row.csv
@@ -0,0 +1,2 @@
+start_id,start_vertex_type,end_id,end_vertex_type
+1,V,2,V,extra
diff --git a/regress/age_load/data/edges_short_row.csv b/regress/age_load/data/edges_short_row.csv
new file mode 100644
index 000000000..e307927b3
--- /dev/null
+++ b/regress/age_load/data/edges_short_row.csv
@@ -0,0 +1,2 @@
+start_id,start_vertex_type,end_id,end_vertex_type
+1,V
diff --git a/regress/age_load/data/labels_long_row.csv b/regress/age_load/data/labels_long_row.csv
new file mode 100644
index 000000000..72ec2a305
--- /dev/null
+++ b/regress/age_load/data/labels_long_row.csv
@@ -0,0 +1,2 @@
+id,name
+1,Alice,extra
diff --git a/regress/expected/age_load.out b/regress/expected/age_load.out
index b4dd0492b..17c5ecc27 100644
--- a/regress/expected/age_load.out
+++ b/regress/expected/age_load.out
@@ -481,10 +481,24 @@ NOTICE:  ELabel "E" has been created
  
 (1 row)
 
--- pipe-delimited edge file -> parses to 1 column -> clean error (was a segfault)
+-- pipe-delimited edge file -> parses to 1 column -> clean error at the header
+-- (was a segfault)
 SELECT load_edges_from_file('agload_delim', 'E', 'age_load/bad_delim_edges.csv');
 ERROR:  edge file must have at least 4 columns (start_id, start_vertex_type, end_id, end_vertex_type), but the header has 1
 HINT:  load_edges_from_file expects a comma-delimited CSV; check the file's delimiter.
+-- per-row guards (header is valid, but an individual data row is ragged):
+-- an edge row with fewer than 4 columns -> clean error (was an OOB read of
+-- the fixed fields[1..3])
+SELECT load_edges_from_file('agload_delim', 'E', 'age_load/edges_short_row.csv');
+ERROR:  edge file row has 2 columns; expected at least 4 and no more than the header's 4 columns
+-- an edge row with more columns than the header -> clean error (was an OOB
+-- read of header[i] in create_agtype_from_list_i)
+SELECT load_edges_from_file('agload_delim', 'E', 'age_load/edges_long_row.csv');
+ERROR:  edge file row has 5 columns; expected at least 4 and no more than the header's 4 columns
+-- a label row with more columns than the header -> clean error (was an OOB
+-- read of header[i] in create_agtype_from_list)
+SELECT load_labels_from_file('agload_delim', 'V', 'age_load/labels_long_row.csv');
+ERROR:  label file row has 3 columns, more than the header's 2 columns
 SELECT drop_graph('agload_delim', true);
 NOTICE:  drop cascades to 4 other objects
 DETAIL:  drop cascades to table agload_delim._ag_label_vertex
diff --git a/regress/sql/age_load.sql b/regress/sql/age_load.sql
index 5f8ef92eb..196b09806 100644
--- a/regress/sql/age_load.sql
+++ b/regress/sql/age_load.sql
@@ -204,9 +204,23 @@ SELECT create_graph('agload_delim');
 SELECT create_vlabel('agload_delim', 'V');
 SELECT create_elabel('agload_delim', 'E');
 
--- pipe-delimited edge file -> parses to 1 column -> clean error (was a segfault)
+-- pipe-delimited edge file -> parses to 1 column -> clean error at the header
+-- (was a segfault)
 SELECT load_edges_from_file('agload_delim', 'E', 'age_load/bad_delim_edges.csv');
 
+-- per-row guards (header is valid, but an individual data row is ragged):
+-- an edge row with fewer than 4 columns -> clean error (was an OOB read of
+-- the fixed fields[1..3])
+SELECT load_edges_from_file('agload_delim', 'E', 'age_load/edges_short_row.csv');
+
+-- an edge row with more columns than the header -> clean error (was an OOB
+-- read of header[i] in create_agtype_from_list_i)
+SELECT load_edges_from_file('agload_delim', 'E', 'age_load/edges_long_row.csv');
+
+-- a label row with more columns than the header -> clean error (was an OOB
+-- read of header[i] in create_agtype_from_list)
+SELECT load_labels_from_file('agload_delim', 'V', 'age_load/labels_long_row.csv');
+
 SELECT drop_graph('agload_delim', true);
 
 --
diff --git a/src/backend/utils/load/ag_load_edges.c b/src/backend/utils/load/ag_load_edges.c
index 51dc9e3e1..01585bab0 100644
--- a/src/backend/utils/load/ag_load_edges.c
+++ b/src/backend/utils/load/ag_load_edges.c
@@ -70,7 +70,7 @@ static void process_edge_row(char **fields, int nfields,
         ereport(ERROR,
                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                  errmsg("edge file row has %d columns; expected at least 4 "
-                        "and no more than the header's %d",
+                        "and no more than the header's %d columns",
                         nfields, header_count)));
     }
 
diff --git a/src/backend/utils/load/ag_load_labels.c b/src/backend/utils/load/ag_load_labels.c
index 9698c784e..236d47a1d 100644
--- a/src/backend/utils/load/ag_load_labels.c
+++ b/src/backend/utils/load/ag_load_labels.c
@@ -60,7 +60,7 @@ static void process_vertex_row(char **fields, int nfields,
         ereport(ERROR,
                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                  errmsg("label file row has %d columns, more than the "
-                        "header's %d", nfields, header_count)));
+                        "header's %d columns", nfields, header_count)));
     }
 
     /* Generate or use provided entry_id */