-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathafter.py
More file actions
87 lines (71 loc) · 2.17 KB
/
after.py
File metadata and controls
87 lines (71 loc) · 2.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import newspaper
def find_max(values):
    """
    Return the largest item contained in values
    """
    largest = max(values)
    return largest
def average(values: list) -> float:
    """
    Arithmetic mean of a list of numbers

    The mean is computed with true division, so the result is a float
    even when every element is an int.

    Raises:
        ZeroDivisionError: if values is empty.
    """
    return sum(values) / len(values)
def chunks(l, n):
    """
    Split a sequence into n consecutive chunks of near-equal length .

    The first len(l) % n chunks hold one element more than the others, so
    chunk lengths never differ by more than one.  When n is larger than
    len(l) the trailing chunks are empty.  This is a generator: each chunk
    is sliced from l only when it is requested.

    Args:
        l: sequence supporting len() and slicing (list, tuple, str, ...).
        n: number of chunks to produce (not the size of each chunk).

    Yields:
        n slices of l, in order, which together cover l exactly once.

    Raises:
        ZeroDivisionError: when iteration starts, if n is 0.
    """
    base, extra = divmod(len(l), n)
    start = 0
    for index in range(n):
        # the first `extra` chunks each absorb one element of the remainder
        stop = start + base + (1 if index < extra else 0)
        yield l[start:stop]
        start = stop
def pull_articles_from_source(url, source, article_data=None, max_articles=10):
    """
    Download articles from a given source and parse them .

    Builds a newspaper source for url, downloads and parses its articles,
    and appends one dict per accepted article to article_data.  The number
    of articles found is printed before download and again after parsing.

    Args:
        url: address handed to newspaper.build().
        source: label stored under the 'source' key of every record.
        article_data: list to append records to; a new list is created
            when omitted.
        max_articles: how many articles from the front of paper.articles
            are examined.  Skipped and failed articles count towards this
            limit.

    Returns:
        article_data, with a dict appended for each accepted article.  Each
        dict has the keys 'source', 'title', 'authors', 'text', 'keywords',
        'summary', 'url' and 'date'.
    """
    # Create the list per call: a literal [] default is evaluated once and
    # would leak records from one call into the next.
    if article_data is None:
        article_data = []
    paper = newspaper.build(url)
    print(len(paper.articles))
    paper.download_articles()
    # remove articles that are too small (probably not articles)
    paper.parse_articles()
    print(len(paper.articles))
    for position, article in enumerate(paper.articles):
        if position >= max_articles:
            break
        try:
            text = article.text
            # skip the article if it is empty or less than 40 words
            if not text or text.isspace() or len(text.split(' ')) < 40:
                continue
            article.nlp()
            # author entries longer than five words are discarded
            authors = [name for name in article.authors
                       if len(name.split(' ')) <= 5]
            article_data.append({
                'source': source,
                'title': article.title,
                'authors': authors,
                'text': article.text,
                'keywords': article.keywords,
                'summary': article.summary,
                'url': article.url,
                'date': article.publish_date,
            })
        except Exception:
            # Best effort: one broken article must not abort the batch.
            # Exception (not a bare except) lets Ctrl-C and SystemExit
            # propagate.
            continue
    return article_data
def source_from_url(link):
    """
    Given a link to a website return the source .

    Rules, applied after the 'https://' / 'http://' scheme is removed:

    * if the first dot-separated label is 'www' (optionally followed by
      digits, e.g. 'www2') the label after it is returned;
    * otherwise, if the link contains '.com', everything before the first
      '.com' is returned (so 'edition.cnn.com' gives 'edition.cnn');
    * otherwise the first dot-separated label is returned.

    Only a leading 'www' label counts as a prefix, so 'www' appearing
    inside a domain name or a path does not change the result.

    Args:
        link: URL or bare host name as a string.

    Returns:
        The source name as a string.
    """
    # Strip the scheme first so every rule below works on the host part.
    stripped = link.replace('https://', '').replace('http://', '')
    labels = stripped.split('.')
    first = labels[0]
    has_www_prefix = first == 'www' or (
        first.startswith('www') and first[3:].isdigit())
    # len check: a lone 'www' with no following label falls through
    if has_www_prefix and len(labels) > 1:
        return labels[1]
    if '.com' in stripped:
        return stripped.split('.com')[0]
    return first