@@ -57,6 +57,8 @@ def __init__(self, load_source, name=None, resources=None, strip=True, limit_row
5757 override_schema = None , override_fields = None ,
5858 extract_missing_values = None ,
5959 deduplicate_headers = False ,
60+ deduplicate_headers_case_sensitive = True ,
61+ deduplicate_headers_format = ' (%s)' ,
6062 on_error = raise_exception ,
6163 ** options ):
6264 super (load , self ).__init__ ()
@@ -70,6 +72,8 @@ def __init__(self, load_source, name=None, resources=None, strip=True, limit_row
7072 self .override_schema = override_schema
7173 self .override_fields = override_fields
7274 self .deduplicate_headers = deduplicate_headers
75+ self .deduplicate_headers_case_sensitive = deduplicate_headers_case_sensitive
76+ self .deduplicate_headers_format = deduplicate_headers_format
7377
7478 # Extract missing values
7579 self .extract_missing_values = None
@@ -180,12 +184,21 @@ def safe_process_datapackage(self, dp: Package):
180184 self .options .setdefault ('headers' , 1 )
181185 self .options .setdefault ('sample_size' , 1000 )
182186 stream : Stream = Stream (self .load_source , ** self .options ).open ()
183- if len (stream .headers ) != len (set (stream .headers )):
187+ if self .deduplicate_headers_case_sensitive :
188+ duplication_test = len (stream .headers ) != len (set (stream .headers ))
189+ else :
190+ lower_headers = [header .lower () for header in stream .headers ]
191+ duplication_test = len (lower_headers ) != len (set (lower_headers ))
192+ # duplication_test = len(stream.headers) != len(set(stream.headers))
193+ if duplication_test :
184194 if not self .deduplicate_headers :
185195 raise ValueError (
186196 'Found duplicate headers.' +
187197 'Use the `deduplicate_headers` flag (found headers=%r)' % stream .headers )
188- stream .headers = self .rename_duplicate_headers (stream .headers )
198+ stream .headers = self .rename_duplicate_headers (
199+ stream .headers , case_sensitive = self .deduplicate_headers_case_sensitive ,
200+ deduplicate_format = self .deduplicate_headers_format
201+ )
189202 schema = Schema (self .override_schema or {}).infer (
190203 stream .sample , headers = stream .headers ,
191204 confidence = 1 , guesser_cls = self .guesser )
@@ -269,15 +282,21 @@ def process_resources(self, resources):
269282 yield it
270283
@staticmethod
def rename_duplicate_headers(duplicate_headers, case_sensitive=True, deduplicate_format=' (%s)'):
    """Return a copy of the header list with duplicated names made unique.

    Every header that occurs more than once gets a running number appended
    (starting at 1 for the first occurrence), rendered through
    ``deduplicate_format``. Headers that occur only once are left untouched.

    Args:
        duplicate_headers: iterable of header strings, possibly with repeats.
        case_sensitive: when False, headers that differ only by letter case
            are treated as duplicates of each other; each keeps its own
            original spelling in the output.
        deduplicate_format: printf-style suffix containing a single ``%s``
            placeholder that receives the occurrence number.

    Returns:
        A new list of headers, in the same order as the input.
    """
    counter = {}      # normalised key -> number of occurrences seen so far
    first_index = {}  # normalised key -> position of its first occurrence
    headers = []
    for header in duplicate_headers:
        # The key used for duplicate detection; the emitted header always
        # keeps its original casing.
        header_key = header if case_sensitive else header.lower()
        count = counter.get(header_key, 0) + 1
        counter[header_key] = count
        if count == 1:
            first_index[header_key] = len(headers)
        else:
            if count == 2:
                # Only now do we know the first occurrence was a duplicate,
                # so go back and tag it with occurrence number 1.
                prev_index = first_index[header_key]
                headers[prev_index] = ('%s' + deduplicate_format) % (headers[prev_index], 1)
            header = ('%s' + deduplicate_format) % (header, count)
        headers.append(header)
    return headers
0 commit comments