-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathquery.py
More file actions
390 lines (328 loc) · 13.7 KB
/
query.py
File metadata and controls
390 lines (328 loc) · 13.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
import json
import os
from functools import wraps
from pprint import pprint
from threading import Thread
from time import time

import cloudant
from requests import post
comment_threshold = 10
max_iterations = 20 # number of iterations that should run; -1 to keep going until all issues/prs fetched
# first in each tuple is
pull_rates = [(100, 3), (90, 7), (80, 9), (70, 12), (60, 15), (50, 20), (40, 25), (30, 35),
(25, 50), (20, 60), (18, 68), (16, 75), (14, 80), (12, 95), (10, 100)]
# get a list of known bots
with open("data/bots.txt") as file:
bots = file.read().splitlines()
# for timing how long it takes a function to run
def time_execution(function):
def wrapper(*args):
print(f"Timing {function.__name__}")
start_time = time()
value = function(*args)
end_time = time()
print(f"{function.__name__} took {round(end_time - start_time, 3)} seconds to run")
return value
return wrapper
# @time_execution # uncomment to time function
def run_query(auth, owner, repo, pull_type):
global max_iterations
discount = 0
print(f"Gathering {pull_type}...")
# db.createDatabase(db_name)
# final list to be returned
ipr_list = []
# stores the authorisation token and accept
headers = {
"Authorization": "token " + auth,
"Accept": "application/vnd.github+json",
}
# for pagination
has_next_page = True
cursor = None
above_threshold = True
# initial pull rate is (12, 100)
pr_index = -1
# bug with github graphql api with sorting by comments
# temporary "fix"
if pull_type == "pullRequests":
max_iterations = 1
pr_index = 0
i = 0
# query can only fetch at most 100 at a time, so keeps fetching until all fetched
while has_next_page and above_threshold and i != max_iterations:
i += 1
# forms the query and performs call, on subsequent iterations passes in cursor for pagination
query = get_comments_query(repo, owner, pull_type, pull_rates[pr_index], discount, cursor)
try:
request = post("https://api.github.com/graphql", json={"query": query}, headers=headers)
except Exception:
print("s")
print(f"error at iteration {i}")
i -= 1
continue
# if api call was successful, adds the comment to the comment list
if request.status_code == 200:
try:
# trims the result of the api call to remove unneeded nesting
trimmed_request = request.json()["data"]["repository"][pull_type]
except TypeError:
if ipr_list is not None:
pprint(ipr_list)
print("Invalid information provided")
break
# pprint(trimmed_request)
# determines if all issues/prs have been fetched
has_next_page = trimmed_request["pageInfo"]["hasNextPage"]
if has_next_page:
cursor = trimmed_request["pageInfo"]["endCursor"]
# checks if any of the issues/prs have fewer than threshold comments
# if so, remove them and don't fetch any more
last_count = trimmed_request["edges"][-1]["node"]["comments"]["totalCount"]
if last_count < comment_threshold:
above_threshold = False
for node in reversed(trimmed_request["edges"]):
if node["node"]["comments"]["totalCount"] < comment_threshold:
trimmed_request["edges"].pop()
else:
break
else:
# determine the pull rate for the next iteration
for j, rate in enumerate(pull_rates):
if last_count <= rate[1]:
if j != pr_index:
pr_index = j
discount = 0 # if pr_index changes, reset the discount
break
# loop through issues/prs
for edge in trimmed_request["edges"]:
# trim data
node = edge["node"]
if node["author"] is not None:
if node["author"]["name"] is not None:
node["author"] = node["author"]["name"]
else:
node["author"] = node["author"]["login"] # remove if more info about author needed
else:
node["author"] = "deletedUser"
node["comments"]["edges"] = filter_comments(node["comments"]["edges"])
# update the comment count
count = len(node["comments"]["edges"])
node["commentCount"] = count
still_above_threshold = count >= comment_threshold
# pull the rest of the comments if there are any
if node["comments"]["pageInfo"]["hasNextPage"]:
comments = get_other_comments(node["number"], repo, owner, pull_type[0:-1],
headers, node["comments"]["pageInfo"]["endCursor"])
# add comments to exiting ones, update commentCount, if under threshold comments, remove
if count + len(comments) >= comment_threshold:
node["comments"]["edges"] += comments
node["commentCount"] += len(comments)
still_above_threshold = True
else:
still_above_threshold = False
if still_above_threshold:
# remove unnecessary nesting
node["comments"] = node["comments"]["edges"]
ipr_list.append(node)
# thread started to add list of issues/prs to the database
# Thread(target=db.addMultipleDocs, args=(trimmed_request["edges"], db_name)).start()
print(f'{len(ipr_list)} {pull_type} gathered') # print progress
else:
print(f"Status code: {str(request.status_code)} on iteration {i}, pr_index = {pr_index}. Retrying")
i -= 1
discount += 1 # fail occurs when too much fetched at once, so fetch less next time
# pprint(json.dumps(ipr_list, indent=4))
return ipr_list
# gets comments for an issue/pr
def get_other_comments(number, repo, owner, p_type, headers, cursor=None):
# for pagination
has_next_page = True
comment_list = None
# query can only fetch at most 100 at a time, so keeps fetching until all fetched
while has_next_page:
# forms the query and performs call, on subsequent iterations passes in cursor for pagination
query = get_ind_query(repo, owner, number, p_type, cursor)
request = post("https://api.github.com/graphql", json={"query": query}, headers=headers)
# if api call was successful, adds the comment to the comment list
if request.status_code == 200:
# trims the result of the api call to remove unneeded nesting
# pprint(request.json())
try:
comments = request.json()["data"]["repository"][p_type]["comments"]
except TypeError:
print("Invalid information provided")
break
except KeyError:
print("error while pulling comments")
break
# pprint(trimmed_request)
# determines if all comments have been fetched
has_next_page = comments["pageInfo"]["hasNextPage"]
if has_next_page:
cursor = comments["pageInfo"]["endCursor"]
filtered_comments = filter_comments(comments["edges"])
# add to list
if comment_list is None:
comment_list = filtered_comments
else:
comment_list += filtered_comments
else:
print(f"Status code: {str(request.status_code)} while fetching comments. Retrying")
return comment_list
# filter out any comments made by bots
def filter_comments(comment_list):
return_list = []
# iterates through each comment removes it if it was made by a bot
for comment in comment_list:
if comment["node"]["author"] is not None:
if comment["node"]["author"]["__typename"] != "Bot" and comment["node"]["author"]["login"] not in bots:
comment["node"]["author"] = comment["node"]["author"]["login"] # remove if more author info needed
# comment["node"]["author"].pop("__typename") # add back if more info about author needed
return_list.append(comment["node"])
else:
comment["node"]["author"] = "deletedUser"
return_list.append(comment["node"])
return return_list
# writes a json_string out to a file
def write_to_file(json_string, repo, p_type):
cwd = os.getcwd()
filepath = cwd + "/fetched_data"
if not os.path.exists(filepath):
os.makedirs(filepath)
# writing data into repoName_pullType.json in cwd/fetched_data directory
with open(filepath + "/" + f"{repo}_{p_type}.json", "w") as outfile:
outfile.write(json_string)
# returns query for issue or pull request comments
def get_comments_query(repo, owner, p_type, pull_rate, discount, cursor=None):
# for pagination
if cursor is not None:
start_point = f', after: "{cursor}"'
else:
start_point = ""
if discount >= pull_rate[0]:
discount = pull_rate[0] - 1
query = """
{
repository(name: "%s", owner: "%s") {
%s(first:%d, orderBy:{field: COMMENTS, direction: DESC}%s) {
edges {
node {
number
title
author {
login
... on User {
name
}
}
state
createdAt
closedAt
createdAt
comments(first: %d) {
totalCount
edges {
node {
author {
login
__typename
}
bodyText
createdAt
}
}
pageInfo {
hasNextPage
endCursor
}
}
}
}
pageInfo {
hasNextPage
endCursor
}
}
}
}
""" % (repo, owner, p_type, pull_rate[0] - discount, start_point, pull_rate[1])
return query
# returns query for individual comments
def get_ind_query(repo, owner, number, p_type, cursor=None):
# for pagination
if cursor is not None:
start_point = f', after: "{cursor}"'
else:
start_point = ""
query = """
{
repository(name: "%s", owner: "%s") {
%s(number: %d) {
comments(first:100%s) {
edges {
node {
author {
login
__typename
}
bodyText
createdAt
}
}
pageInfo {
hasNextPage
endCursor
}
}
}
}
}
""" % (repo, owner, p_type, number, start_point)
return query
# main function for testing code
if __name__ == '__main__':
print("Enter an access token: ", end="")
auth = input()
pull_type = ""
valid = False
while not valid:
print("Enter a repo (owner/repo): ", end="")
owner_repo = input().split("/")
if len(owner_repo) != 2:
print("Invalid input")
else:
print("Get issues or pull requests? (i or p): ", end="")
letter = input()
if letter == "i":
pull_type = "issues"
valid = True
elif letter == "p":
pull_type = "pullRequests"
valid = True
else:
print("Invalid input")
database = cloudant.Database("credentials/cloudant_credentials.json")
if pull_type == "pullRequests":
database_name = f"{owner_repo[0]}/{owner_repo[1]}-pull_requests"
else:
database_name = f"{owner_repo[0]}/{owner_repo[1]}-{pull_type}"
result = None
args = auth, owner_repo[0], owner_repo[1], pull_type
if database.checkDatabases(database_name):
print(f"{owner_repo[0]}/{owner_repo[1]}-{pull_type} is already in the database. Use existing data? (y/n): "
, end="")
valid = False
while not valid:
ans = input()
if ans == 'y':
print("Running analysis on existing data")
valid = True
elif ans == 'n':
database.clearDatabase(database_name)
result = run_query(*args)
valid = True
else:
print("Invalid input. Use existing data? (y/n): ")
else:
result = run_query(*args)
if result is not None:
write_to_file(result, owner_repo[1], pull_type)