@@ -38,11 +38,25 @@ def save_audio_file(destination_path, audio_id, source_audio_extension='opus'):
3838 __save_file (url , local_path )
3939 print ('File {}.{} saved to {}' .format (audio_id , source_audio_extension , destination_path ))
4040
def __generate_date_list_in_isoformat(start, end):
    """Build one ISO-8601 timestamp string per day between two datetimes.

    The number of entries is the whole-day difference between `end` and
    `start` plus one; each entry is `start` shifted by that many days, with
    microseconds dropped and a literal `Z` appended.

    Args:
        start: first datetime of the range.
        end: last datetime of the range.

    Returns:
        List of strings such as `2020-01-01T00:00:00Z` (empty when `end` is
        more than a day before `start`).
    """
    day_count = (end - start).days + 1
    iso_dates = []
    for day_index in range(day_count):
        current = start + datetime.timedelta(days=day_index)
        iso_dates.append(current.replace(microsecond=0).isoformat() + 'Z')
    return iso_dates
def __generate_date_in_isoformat(date):
    """Format a datetime as an ISO-8601 string ending with `Z`.

    Microseconds are dropped. A timezone-aware value is first converted to
    UTC and stripped of its tzinfo; otherwise `isoformat()` would emit an
    explicit offset and the appended `Z` would yield a malformed stamp such
    as `2020-01-02T03:04:05+07:00Z`. A naive value is formatted unchanged
    (NOTE(review): callers presumably pass naive values already in UTC --
    confirm).

    Args:
        date: a `datetime.datetime` instance, naive or timezone-aware.

    Returns:
        String of the form `YYYY-MM-DDTHH:MM:SSZ`.
    """
    utc_offset = date.utcoffset()
    if utc_offset is not None:
        # Shift the wall-clock time to UTC, then drop tzinfo so isoformat()
        # does not append "+HH:MM" in front of the trailing "Z".
        date = (date - utc_offset).replace(tzinfo=None)
    return date.replace(microsecond=0).isoformat() + 'Z'
44+
def __get_all_segments(token, guardian_id, start, end, page_size=1000, fetch=None):
    """Collect every audio segment in a time range by paging through results.

    Pages are requested in ascending order until the server returns nothing.

    Args:
        token: credential forwarded unchanged to the fetch function.
        guardian_id: identifier of the guardian whose audio is listed.
        start: start of the range, forwarded unchanged to the fetch function.
        end: end of the range, forwarded unchanged to the fetch function.
        page_size: value passed as `limit` on each request (default 1000).
        fetch: callable with the same signature as `guardianAudio`; defaults
            to `guardianAudio` when omitted.

    Returns:
        List with the segments of all pages concatenated; empty when the
        first request yields no data.
    """
    if fetch is None:
        fetch = guardianAudio

    all_segments = []
    offset = 0

    while True:
        # No data will return `None` from server
        segments = fetch(token, guardian_id, start, end, limit=page_size, offset=offset, descending=False)
        if not segments:
            break
        all_segments.extend(segments)
        # Advance by the number of records actually received rather than by
        # the requested page size, so a short page cannot cause records to
        # be skipped on the next request.
        offset += len(segments)

    return all_segments
4660
4761def __segmentDownload (audio_path , file_ext , segment ):
4862 audio_id = segment ['guid' ]
@@ -73,23 +87,24 @@ def downloadGuardianAudio(token, destination_path, guardian_id, min_date, max_da
7387 audio_path = destination_path + '/' + guardian_id
7488 if not os .path .exists (audio_path ):
7589 os .makedirs (audio_path )
76- dates = __generate_date_list_in_isoformat (min_date , max_date )
7790
78- for date in dates :
79- date_end = date .replace ('00:00:00' , '23:59:59' )
80- segments = guardianAudio (token , guardian_id , date , date_end , limit = 1000 , descending = False )
91+ start = __generate_date_in_isoformat (min_date )
92+ end = __generate_date_in_isoformat (max_date )
8193
82- if segments :
83- if (parallel ):
84- with concurrent .futures .ThreadPoolExecutor (max_workers = 100 ) as executor :
85- futures = []
86- for segment in segments :
87- futures .append (executor .submit (__segmentDownload , audio_path = audio_path , file_ext = file_ext , segment = segment ))
88-
89- futures , _ = concurrent .futures .wait (futures )
90- else :
94+ segments = __get_all_segments (token , guardian_id , start , end )
95+
96+ if segments :
97+ print ("Downloading {} audio from {}" .format (len (segments ), guardian_id ))
98+ if (parallel ):
99+ with concurrent .futures .ThreadPoolExecutor (max_workers = 100 ) as executor :
100+ futures = []
91101 for segment in segments :
92- __segmentDownload (audio_path , file_ext , segment )
93- print ("Finish download on" , guardian_id , date [:- 10 ])
102+ futures .append (executor .submit (__segmentDownload , audio_path = audio_path , file_ext = file_ext , segment = segment ))
103+
104+ futures , _ = concurrent .futures .wait (futures )
94105 else :
95- print ("No data on date:" , date [:- 10 ])
106+ for segment in segments :
107+ __segmentDownload (audio_path , file_ext , segment )
108+ print ("Finish download on {}" .format (guardian_id ))
109+ else :
110+ print ("No data found on {} - {} at {}" .format (start [:- 10 ], end [:- 10 ], guardian_id ))
0 commit comments