-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathquery.py
More file actions
390 lines (328 loc) · 13.7 KB
/
query.py
File metadata and controls
390 lines (328 loc) · 13.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
import json
import os
from functools import wraps
from pprint import pprint
from threading import Thread
from time import time

import cloudant
from requests import post
comment_threshold = 10
max_iterations = 20 # number of iterations that should run; -1 to keep going until all issues/prs fetched
# first in each tuple is
pull_rates = [(100, 3), (90, 7), (80, 9), (70, 12), (60, 15), (50, 20), (40, 25), (30, 35),
(25, 50), (20, 60), (18, 68), (16, 75), (14, 80), (12, 95), (10, 100)]
# get a list of known bots
with open("data/bots.txt") as file:
bots = file.read().splitlines()
# for timing how long it takes a function to run
def time_execution(function):
def wrapper(*args):
print(f"Timing {function.__name__}")
start_time = time()
value = function(*args)
end_time = time()
print(f"{function.__name__} took {round(end_time - start_time, 3)} seconds to run")
return value
return wrapper
# @time_execution # uncomment to time function
def run_query(auth, owner, repo, pull_type):
global max_iterations
discount = 0
print(f"Gathering {pull_type}...")
# db.createDatabase(db_name)
# final list to be returned
ipr_list = []
# stores the authorisation token and accept
headers = {
"Authorization": "token " + auth,
"Accept": "application/vnd.github+json",
}
# for pagination
has_next_page = True
cursor = None
above_threshold = True
# initial pull rate is (12, 100)
pr_index = -1
# bug with github graphql api with sorting by comments
# temporary "fix"
if pull_type == "pullRequests":
max_iterations = 1
pr_index = 0
i = 0
# query can only fetch at most 100 at a time, so keeps fetching until all fetched
while has_next_page and above_threshold and i != max_iterations:
i += 1
# forms the query and performs call, on subsequent iterations passes in cursor for pagination
query = get_comments_query(repo, owner, pull_type, pull_rates[pr_index], discount, cursor)
try:
request = post("https://api.github.com/graphql", json={"query": query}, headers=headers)
except Exception:
print("s")
print(f"error at iteration {i}")
i -= 1
continue
# if api call was successful, adds the comment to the comment list
if request.status_code == 200:
try:
# trims the result of the api call to remove unneeded nesting
trimmed_request = request.json()["data"]["repository"][pull_type]
except TypeError:
if ipr_list is not None:
pprint(ipr_list)
print("Invalid information provided")
break
# pprint(trimmed_request)
# determines if all issues/prs have been fetched
has_next_page = trimmed_request["pageInfo"]["hasNextPage"]
if has_next_page:
cursor = trimmed_request["pageInfo"]["endCursor"]
# checks if any of the issues/prs have fewer than threshold comments
# if so, remove them and don't fetch any more
last_count = trimmed_request["edges"][-1]["node"]["comments"]["totalCount"]
if last_count < comment_threshold:
above_threshold = False
for node in reversed(trimmed_request["edges"]):
if node["node"]["comments"]["totalCount"] < comment_threshold:
trimmed_request["edges"].pop()
else:
break
else:
# determine the pull rate for the next iteration
for j, rate in enumerate(pull_rates):
if last_count <= rate[1]:
if j != pr_index:
pr_index = j
discount = 0 # if pr_index changes, reset the discount
break
# loop through issues/prs
for edge in trimmed_request["edges"]:
# trim data
node = edge["node"]
if node["author"] is not None:
if node["author"]["name"] is not None:
node["author"] = node["author"]["name"]
else:
node["author"] = node["author"]["login"] # remove if more info about author needed
else:
node["author"] = "deletedUser"
node["comments"]["edges"] = filter_comments(node["comments"]["edges"])
# update the comment count
count = len(node["comments"]["edges"])
node["commentCount"] = count
still_above_threshold = count >= comment_threshold
# pull the rest of the comments if there are any
if node["comments"]["pageInfo"]["hasNextPage"]:
comments = get_other_comments(node["number"], repo, owner, pull_type[0:-1],
headers, node["comments"]["pageInfo"]["endCursor"])
# add comments to exiting ones, update commentCount, if under threshold comments, remove
if count + len(comments) >= comment_threshold:
node["comments"]["edges"] += comments
node["commentCount"] += len(comments)
still_above_threshold = True
else:
still_above_threshold = False
if still_above_threshold:
# remove unnecessary nesting
node["comments"] = node["comments"]["edges"]
ipr_list.append(node)
# thread started to add list of issues/prs to the database
# Thread(target=db.addMultipleDocs, args=(trimmed_request["edges"], db_name)).start()
print(f'{len(ipr_list)} {pull_type} gathered') # print progress
else:
print(f"Status code: {str(request.status_code)} on iteration {i}, pr_index = {pr_index}. Retrying")
i -= 1
discount += 1 # fail occurs when too much fetched at once, so fetch less next time
# pprint(json.dumps(ipr_list, indent=4))
return ipr_list
# gets comments for an issue/pr
def get_other_comments(number, repo, owner, p_type, headers, cursor=None):
# for pagination
has_next_page = True
comment_list = None
# query can only fetch at most 100 at a time, so keeps fetching until all fetched
while has_next_page:
# forms the query and performs call, on subsequent iterations passes in cursor for pagination
query = get_ind_query(repo, owner, number, p_type, cursor)
request = post("https://api.github.com/graphql", json={"query": query}, headers=headers)
# if api call was successful, adds the comment to the comment list
if request.status_code == 200:
# trims the result of the api call to remove unneeded nesting
# pprint(request.json())
try:
comments = request.json()["data"]["repository"][p_type]["comments"]
except TypeError:
print("Invalid information provided")
break
except KeyError:
print("error while pulling comments")
break
# pprint(trimmed_request)
# determines if all comments have been fetched
has_next_page = comments["pageInfo"]["hasNextPage"]
if has_next_page:
cursor = comments["pageInfo"]["endCursor"]
filtered_comments = filter_comments(comments["edges"])
# add to list
if comment_list is None:
comment_list = filtered_comments
else:
comment_list += filtered_comments
else:
print(f"Status code: {str(request.status_code)} while fetching comments. Retrying")
return comment_list
# filter out any comments made by bots
def filter_comments(comment_list):
return_list = []
# iterates through each comment removes it if it was made by a bot
for comment in comment_list:
if comment["node"]["author"] is not None:
if comment["node"]["author"]["__typename"] != "Bot" and comment["node"]["author"]["login"] not in bots:
comment["node"]["author"] = comment["node"]["author"]["login"] # remove if more author info needed
# comment["node"]["author"].pop("__typename") # add back if more info about author needed
return_list.append(comment["node"])
else:
comment["node"]["author"] = "deletedUser"
return_list.append(comment["node"])
return return_list
# writes a json_string out to a file
def write_to_file(json_string, repo, p_type):
cwd = os.getcwd()
filepath = cwd + "/fetched_data"
if not os.path.exists(filepath):
os.makedirs(filepath)
# writing data into repoName_pullType.json in cwd/fetched_data directory
with open(filepath + "/" + f"{repo}_{p_type}.json", "w") as outfile:
outfile.write(json_string)
# returns query for issue or pull request comments
def get_comments_query(repo, owner, p_type, pull_rate, discount, cursor=None):
# for pagination
if cursor is not None:
start_point = f', after: "{cursor}"'
else:
start_point = ""
if discount >= pull_rate[0]:
discount = pull_rate[0] - 1
query = """
{
repository(name: "%s", owner: "%s") {
%s(first:%d, orderBy:{field: COMMENTS, direction: DESC}%s) {
edges {
node {
number
title
author {
login
... on User {
name
}
}
state
createdAt
closedAt
createdAt
comments(first: %d) {
totalCount
edges {
node {
author {
login
__typename
}
bodyText
createdAt
}
}
pageInfo {
hasNextPage
endCursor
}
}
}
}
pageInfo {
hasNextPage
endCursor
}
}
}
}
""" % (repo, owner, p_type, pull_rate[0] - discount, start_point, pull_rate[1])
return query
# returns query for individual comments
def get_ind_query(repo, owner, number, p_type, cursor=None):
# for pagination
if cursor is not None:
start_point = f', after: "{cursor}"'
else:
start_point = ""
query = """
{
repository(name: "%s", owner: "%s") {
%s(number: %d) {
comments(first:100%s) {
edges {
node {
author {
login
__typename
}
bodyText
createdAt
}
}
pageInfo {
hasNextPage
endCursor
}
}
}
}
}
""" % (repo, owner, p_type, number, start_point)
return query
# main function for testing code
if __name__ == '__main__':
print("Enter an access token: ", end="")
auth = input()
pull_type = ""
valid = False
while not valid:
print("Enter a repo (owner/repo): ", end="")
owner_repo = input().split("/")
if len(owner_repo) != 2:
print("Invalid input")
else:
print("Get issues or pull requests? (i or p): ", end="")
letter = input()
if letter == "i":
pull_type = "issues"
valid = True
elif letter == "p":
pull_type = "pullRequests"
valid = True
else:
print("Invalid input")
database = cloudant.Database("credentials/cloudant_credentials.json")
if pull_type == "pullRequests":
database_name = f"{owner_repo[0]}/{owner_repo[1]}-pull_requests"
else:
database_name = f"{owner_repo[0]}/{owner_repo[1]}-{pull_type}"
result = None
args = auth, owner_repo[0], owner_repo[1], pull_type
if database.checkDatabases(database_name):
print(f"{owner_repo[0]}/{owner_repo[1]}-{pull_type} is already in the database. Use existing data? (y/n): "
, end="")
valid = False
while not valid:
ans = input()
if ans == 'y':
print("Running analysis on existing data")
valid = True
elif ans == 'n':
database.clearDatabase(database_name)
result = run_query(*args)
valid = True
else:
print("Invalid input. Use existing data? (y/n): ")
else:
result = run_query(*args)
if result is not None:
write_to_file(result, owner_repo[1], pull_type)