Skip to content

Commit ac22fb5

Browse files
authored
Merge pull request #258 from oree-xx/Wikipedia-automation
Add a completion function.
2 parents 3b100a9 + b5c80e0 commit ac22fb5

File tree

3 files changed

+32
-45
lines changed

3 files changed

+32
-45
lines changed

scripts/1-fetch/wikipedia_fetch.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,18 @@ def parse_arguments():
6363
return args
6464

6565

66+
def check_for_completion():
    """Stop early when the Wikipedia data fetch already finished for QUARTER.

    Counts the data rows in FILE_LANGUAGES; more than 300 rows is taken to
    mean the fetch is complete, and a QuantifyingException with exit code 0
    is raised to halt the run cleanly. A missing file is ignored (it simply
    means no prior run saved data).
    """
    try:
        with open(FILE_LANGUAGES, "r", newline="") as file_obj:
            reader = csv.DictReader(file_obj, dialect="unix")
            row_count = sum(1 for _ in reader)
        if row_count > 300:
            raise shared.QuantifyingException(
                f"Data fetch completed for {QUARTER}", 0
            )
    except FileNotFoundError:
        pass  # File may not be found without --enable-save, etc.
76+
77+
6678
def write_data(args, tool_data):
6779
if not args.enable_save:
6880
return args
@@ -157,6 +169,7 @@ def query_wikipedia_languages(session):
157169
def main():
158170
args = parse_arguments()
159171
shared.paths_log(LOGGER, PATHS)
172+
check_for_completion()
160173
shared.git_fetch_and_merge(args, PATHS["repo"])
161174
session = shared.get_session()
162175
tool_data = query_wikipedia_languages(session)

scripts/2-process/github_process.py

Lines changed: 9 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
import traceback
1212

1313
# Third-party
14-
# import pandas as pd
1514
import pandas as pd
1615

1716
# Add parent directory so shared can be imported
@@ -60,6 +59,13 @@ def parse_arguments():
6059
return args
6160

6261

62+
def check_for_data_file(file_path):
    """Raise QuantifyingException (exit code 0) if file_path already exists.

    Used to skip re-processing when the processed CSV for QUARTER is
    already on disk.
    """
    if not os.path.exists(file_path):
        return
    raise shared.QuantifyingException(
        f"Processed data already exists for {QUARTER}", 0
    )
67+
68+
6369
def data_to_csv(args, data, file_path):
6470
if not args.enable_save:
6571
return
@@ -92,6 +98,7 @@ def process_totals_by_license(args, count_data):
9298
file_path = shared.path_join(
9399
PATHS["data_phase"], "github_totals_by_license.csv"
94100
)
101+
check_for_data_file(file_path)
95102
data_to_csv(args, data, file_path)
96103

97104

@@ -126,52 +133,10 @@ def process_totals_by_restriction(args, count_data):
126133
file_path = shared.path_join(
127134
PATHS["data_phase"], "github_totals_by_restriction.csv"
128135
)
136+
check_for_data_file(file_path)
129137
data_to_csv(args, data, file_path)
130138

131139

132-
# def load_quarter_data(quarter):
133-
# """
134-
# Load data for a specific quarter.
135-
# """
136-
# file_path = os.path.join(PATHS["data"], f"{quarter}",
137-
# "1-fetch", "github_fetched")
138-
# if not os.path.exists(file_path):
139-
# LOGGER.error(f"Data file for quarter {quarter} not found.")
140-
# return None
141-
# return pd.read_csv(file_path)
142-
143-
144-
# def compare_data(current_quarter, previous_quarter):
145-
# """
146-
# Compare data between two quarters.
147-
# """
148-
# current_data = load_quarter_data(current_quarter)
149-
# previous_data = load_quarter_data(previous_quarter)
150-
151-
# if current_data is None or previous_data is None:
152-
# return
153-
154-
# Process data to compare totals
155-
156-
157-
# def parse_arguments():
158-
# """
159-
# Parses command-line arguments, returns parsed arguments.
160-
# """
161-
# LOGGER.info("Parsing command-line arguments")
162-
# parser = argparse.ArgumentParser(
163-
# description="Google Custom Search Comparison Report")
164-
# parser.add_argument(
165-
# "--current_quarter", type=str, required=True,
166-
# help="Current quarter for comparison (e.g., 2024Q3)"
167-
# )
168-
# parser.add_argument(
169-
# "--previous_quarter", type=str, required=True,
170-
# help="Previous quarter for comparison (e.g., 2024Q2)"
171-
# )
172-
# return parser.parse_args()
173-
174-
175140
def main():
176141
args = parse_arguments()
177142
shared.paths_log(LOGGER, PATHS)

scripts/2-process/wikipedia_process.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,13 @@ def parse_arguments():
6363
return args
6464

6565

66+
def check_for_data_file(file_path):
    """Abort with exit code 0 when processed data for QUARTER already exists.

    Raises shared.QuantifyingException if file_path is present, so the
    processing step is not repeated for the same quarter.
    """
    already_processed = os.path.exists(file_path)
    if already_processed:
        raise shared.QuantifyingException(
            f"Processed data already exists for {QUARTER}", 0
        )
71+
72+
6673
def data_to_csv(args, data, file_path):
6774
if not args.enable_save:
6875
return
@@ -91,6 +98,7 @@ def process_highest_language_usage(args, count_data):
9198
file_path = shared.path_join(
9299
PATHS["data_phase"], "wikipedia_highest_language_usage.csv"
93100
)
101+
check_for_data_file(file_path)
94102
data_to_csv(args, top_10, file_path)
95103

96104

@@ -114,6 +122,7 @@ def process_least_language_usage(args, count_data):
114122
file_path = shared.path_join(
115123
PATHS["data_phase"], "wikipedia_least_language_usage.csv"
116124
)
125+
check_for_data_file(file_path)
117126
data_to_csv(args, bottom_10, file_path)
118127

119128

@@ -140,14 +149,14 @@ def process_language_representation(args, count_data):
140149
file_path = shared.path_join(
141150
PATHS["data_phase"], "wikipedia_language_representation.csv"
142151
)
152+
check_for_data_file(file_path)
143153
data_to_csv(args, language_counts, file_path)
144154

145155

146156
def main():
147157
args = parse_arguments()
148158
shared.paths_log(LOGGER, PATHS)
149159
shared.git_fetch_and_merge(args, PATHS["repo"])
150-
151160
file_count = shared.path_join(
152161
PATHS["data_1-fetch"], "wikipedia_count_by_languages.csv"
153162
)

0 commit comments

Comments
 (0)