auto-docs/after.py at master · jweissenberger/auto-docs · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import newspaper


def find_max(values):
    """
    Return the largest item of values, as chosen by the built-in max().
    """
    largest = max(values)
    return largest


def average(values: list) -> float:
    """
    Return the arithmetic mean of a list of numbers.

    Args:
        values: non-empty list of numbers.

    Returns:
        The sum of the items divided by their count. True division is used,
        so int and float inputs always produce a float (e.g. [1, 2] -> 1.5).

    Raises:
        ZeroDivisionError: if values is empty.
    """
    return sum(values) / len(values)


def chunks(l, n):
    """
    Yield n consecutive slices of l whose lengths differ by at most one.

    The first len(l) % n slices receive one extra element; when l has fewer
    than n elements the trailing slices are empty.
    """
    base, extra = divmod(len(l), n)
    start = 0
    for idx in range(n):
        # the leading `extra` slices absorb the remainder, one element each
        size = base + 1 if idx < extra else base
        yield l[start:start + size]
        start += size


def pull_articles_from_source(url, source, article_data=None, max_articles=10):
    """
    Download articles from a given source, parse them and collect one record per article.

    Args:
        url: address passed to newspaper.build() to discover the site's articles.
        source: label stored under the 'source' key of every record.
        article_data: optional list to append the records to. A fresh list is
            created when omitted, so separate calls never share results.
        max_articles: number of discovered articles to examine (default 10).
            Articles that are skipped or fail still count towards this limit.

    Returns:
        article_data, extended with one dict per processed article. Each dict has
        the keys 'source', 'title', 'authors', 'text', 'keywords', 'summary',
        'url' and 'date'.
    """
    if article_data is None:
        article_data = []

    paper = newspaper.build(url)
    print(len(paper.articles))
    paper.download_articles()
    paper.parse_articles()  # remove articles that are too small (probably not articles)
    print(len(paper.articles))

    for position, article in enumerate(paper.articles):
        if position >= max_articles:
            break
        try:
            text = article.text
            # skip the article if it is empty or less than 40 words
            if text.isspace() or text == '' or len(text.split(' ')) < 40:
                continue
            article.nlp()

            # keep only author entries of at most five words
            # (longer entries are presumably parsing noise, not names)
            authors = [name for name in article.authors if len(name.split(' ')) <= 5]

            article_data.append({
                'source': source,
                'title': article.title,
                'authors': authors,
                'text': text,
                'keywords': article.keywords,
                'summary': article.summary,
                'url': article.url,
                'date': article.publish_date,
            })
        except Exception as err:
            # best effort: one broken article must not abort the whole source,
            # but report it instead of swallowing the error silently
            print('Skipping article that could not be processed:', repr(err))

    return article_data


def source_from_url(link):
    """
    Given a link to a website return the source (a short site name).

    Only the host part of the link is inspected, so 'www', '.com' or dots that
    appear in the path, query or fragment cannot leak into the result.

    Examples:
        'https://www.cnn.com/world' -> 'cnn'
        'https://cnn.com'           -> 'cnn'
        'http://bbc.co.uk/news'     -> 'bbc'

    Args:
        link: URL string, with or without a leading 'http://' or 'https://'.

    Returns:
        The label after a leading 'www' (or 'www2', 'www3', ...) label if there
        is one; otherwise the part of the host before '.com'; otherwise the
        first label of the host.
    """
    host = link
    for scheme in ('https://', 'http://'):
        if host.startswith(scheme):
            host = host[len(scheme):]
            break
    # drop everything after the host: path, query string and fragment
    for delimiter in ('/', '?', '#'):
        host = host.split(delimiter, 1)[0]

    labels = host.split('.')
    first = labels[0]
    has_www_prefix = first == 'www' or (first.startswith('www') and first[3:].isdigit())
    # the length guard avoids an IndexError for a bare 'www' host
    if has_www_prefix and len(labels) > 1:
        return labels[1]
    if '.com' in host:
        return host.split('.com')[0]
    return first