Skip to content

Commit 64ddb7a

Browse files
committed
Improve license download script and fix handling of redirects
1 parent b6ab56b commit 64ddb7a

File tree

1 file changed

+145
-67
lines changed

1 file changed

+145
-67
lines changed

scripts/fetch_requirement_licences.py

Lines changed: 145 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,12 @@
1111
for the Python packages that this script failed to locate.
1212
4) Delete the old license files.
1313
"""
14+
15+
import argparse
1416
import json
17+
import logging
18+
import sys
19+
import time
1520
from typing import (
1621
Sequence,
1722
)
@@ -24,80 +29,153 @@
2429
)
2530

2631
from azul import (
32+
cached_property,
2733
config,
28-
require,
34+
)
35+
from azul.args import (
36+
AzulArgumentHelpFormatter,
2937
)
3038
from azul.http import (
39+
HttpClient,
3140
http_client,
3241
)
42+
from azul.logging import (
43+
configure_script_logging,
44+
)
3345

46+
log = logging.getLogger(__name__)
3447

35-
def github_urls(urls: Sequence[str]) -> set[str]:
36-
"""
37-
Return the GitHub URLs from the list of URLs given.
38-
"""
39-
urls_ = set()
40-
for url in urls:
41-
url_ = furl(url.rstrip('/'))
42-
if url_.netloc == 'github.com':
43-
last_segment = url_.path.segments[-1] if url_.path.segments else ''
44-
if last_segment == 'issues':
45-
# https://github.com/USER/PACKAGE/issues
46-
url_.path.segments.pop()
47-
elif last_segment.endswith('.git'):
48-
# https://github.com/googleapis/proto-plus-python.git
49-
url_.path.segments[-1] = last_segment[:-4]
50-
urls_.add(str(url_))
51-
return urls_
52-
53-
54-
destination_path = f'{config.project_root}/docs/licenses/python/'
55-
56-
license_file_names = [
57-
'LICENSE',
58-
'LICENSE.txt',
59-
'LICENSE.rst',
60-
'LICENSE.md',
61-
'LICENSE.mit',
62-
'COPYING',
63-
'COPYING.BSD',
64-
'LICENCE',
65-
'LICENCE.md'
66-
]
67-
68-
http = http_client()
69-
70-
with open(f'{config.project_root}/requirements.all.txt', 'r') as f:
71-
lines = f.readlines()
72-
73-
failures = []
74-
for line in lines:
75-
if line:
76-
package, version = line.split('==')
77-
pypi_url = f'https://pypi.org/pypi/{package}'
78-
response = http.request('GET', f'{pypi_url}/json')
79-
assert isinstance(response, HTTPResponse)
80-
require(response.status == 200, response)
81-
urls = json.loads(response.data)['info']['project_urls']
82-
found = False
83-
for url in github_urls(urls.values()):
84-
for filename in license_file_names:
85-
response = http.request('GET', f'{url}/raw/HEAD/{filename}')
48+
49+
class FetchLicenses:
50+
destination_path = f'{config.project_root}/docs/licenses/python/'
51+
52+
license_file_names = [
53+
'LICENSE',
54+
'LICENSE.txt',
55+
'LICENSE.rst',
56+
'LICENSE.md',
57+
'LICENSE.mit',
58+
'COPYING',
59+
'COPYING.BSD',
60+
'LICENCE',
61+
'LICENCE.md'
62+
]
63+
64+
def main(self, argv):
    """
    Download a license file for each requested Python package and report
    the packages for which none could be found.

    The packages are either those given with ``--package`` or, if that
    option is absent, every requirement listed in ``requirements.all.txt``
    in the project root. Each license is written to
    ``{destination_path}{package}.txt``.

    :param argv: the command line arguments, excluding the program name
    """
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=AzulArgumentHelpFormatter)
    parser.add_argument('--package', '-p',
                        help='Optionally specify one or more packages to '
                             'download from. If not specified, licenses from '
                             'all Python dependencies will be downloaded.',
                        nargs='+',
                        metavar='PACKAGE',
                        )
    parser.add_argument('--debug',
                        action='store_true',
                        help='Log debugging information')
    args = parser.parse_args(argv)

    if args.package:
        packages = list(args.package)
    else:
        packages = []
        with open(f'{config.project_root}/requirements.all.txt', 'r') as f:
            for line in f:
                # Lines read from a file keep their terminator. Without
                # stripping it, a blank line would become the truthy
                # "package" '\n' and an unpinned requirement would carry a
                # newline into the PyPI URL.
                requirement = line.strip()
                if requirement and not requirement.startswith('#'):
                    packages.append(requirement.split('==')[0].strip())

    failures = []
    for package in packages:
        if package:
            pypi_url = f'https://pypi.org/pypi/{package}'
            if self._fetch_license(package, pypi_url, debug=args.debug):
                log.info('%s... SUCCESS', package)
            else:
                failures.append(package)
                log.info('%s... FAIL (%s)', package, pypi_url)

    if failures:
        log.error('Failed to fetch licenses for packages: %s', failures)

def _fetch_license(self, package, pypi_url, *, debug):
    """
    Look up the project URLs of one package on PyPI, probe each GitHub URL
    among them for a known license file name and save the first hit.

    :param package: the name of the package; also names the output file

    :param pypi_url: the PyPI URL of the package, without the ``/json``
                     suffix

    :param debug: whether to emit the additional debug log messages

    :return: True if a license file was found and written, False otherwise
    """
    response = self.get_response(f'{pypi_url}/json')
    assert isinstance(response, HTTPResponse)
    if response.status != 200:
        return False
    urls = json.loads(response.data)['info']['project_urls']
    if not urls:
        return False
    if debug:
        log.debug('%s urls: %s', package, urls)
    for url in self.github_urls(urls.values()):
        url_raw = furl(url)
        if len(url_raw.path.segments) > 2:
            # The URL already points into the repository tree; switch it to
            # the raw view of the same ref
            if url_raw.path.segments[2] in ('blob', 'tree'):
                url_raw.path.segments[2] = 'raw'
        else:
            # A bare repository URL: look at the default branch
            url_raw.path.segments.extend(['raw', 'HEAD'])
        # The browsable counterpart of the raw URL, recorded at the top of
        # the saved license file
        url_blob = url_raw.copy()
        url_blob.path.segments[2] = 'blob'
        for filename in self.license_file_names:
            response = self.get_response(f'{url_raw}/{filename}')
            assert isinstance(response, HTTPResponse)
            if response.status == 200:
                if debug:
                    log.debug('Found %s/%s', url_raw, filename)
                file_path = f'{self.destination_path}{package}.txt'
                with open(file_path, 'wb') as f:
                    f.write(f'{url_blob}/{filename}\n\n'.encode('ascii'))
                    f.write(response.data)
                return True
    return False
129+
130+
@cached_property
def http(self) -> HttpClient:
    """
    The HTTP client through which this script issues its requests, as
    created by ``http_client()``.
    """
    # NOTE(review): presumably cached_property memoizes the client per
    # instance, as its functools namesake does; confirm against azul.
    client = http_client()
    return client
133+
134+
def get_response(self, url: str) -> HTTPResponse:
    """
    Issue a GET request for the given URL, following any 301 or 302
    redirects by hand, and return the first response that is not such a
    redirect.

    If a redirect carries a ``Retry-After`` header, sleep for the indicated
    time before following it. The header value arrives as a string and may
    be either a number of seconds or an HTTP date. Both forms are honored,
    and a value that is neither is ignored.

    :param url: the URL to request

    :return: the final response. A redirect that lacks a ``Location``
             header is returned as is, since there is nothing to follow.
    """
    # Function-scope imports keep this method self-contained
    from datetime import datetime, timezone
    from email.utils import parsedate_to_datetime
    from urllib.parse import urljoin

    while True:
        response = self.http.request('GET', url)
        if response.status not in (301, 302):
            return response
        location = response.get_redirect_location()
        if not location:
            return response
        # The Location header may be relative to the URL just requested
        url = urljoin(url, location)
        retry_after = response.headers.get('Retry-After')
        if retry_after is not None:
            try:
                # Header values are strings; passing one to time.sleep() or
                # to a %f format directive raises TypeError
                delay = float(retry_after)
            except ValueError:
                try:
                    moment = parsedate_to_datetime(retry_after)
                except (TypeError, ValueError):
                    # Neither seconds nor a date, nothing to honor
                    delay = 0.0
                else:
                    if moment.tzinfo is None:
                        moment = moment.replace(tzinfo=timezone.utc)
                    delay = (moment - datetime.now(timezone.utc)).total_seconds()
            # The comparison is also False for NaN and for dates in the past
            if delay > 0:
                # Same logger object as the module-level one, looked up by
                # name so that this method does not depend on a global
                logging.getLogger(__name__).info(
                    'Sleeping %.3fs to honor Retry-After property', delay
                )
                time.sleep(delay)
145+
146+
def github_urls(self, urls: Sequence[str]) -> list[str]:
    """
    Select the URLs hosted on github.com from the given ones, reduce each to
    the project it refers to, and return the distinct results in sorted
    order.
    """
    readme_suffixes = ('.md', '.rst')
    extra_segments = ('discussions', 'issues', 'pulls', 'wiki')
    found: set[str] = set()
    for candidate in urls:
        parsed = furl(candidate).remove(args=True, fragment=True)
        path = parsed.path
        # A trailing '/' leaves an empty last segment behind
        if path.segments and path.segments[-1] == '':
            path.segments.pop()
        if parsed.netloc != 'github.com' or not path.segments:
            continue
        tail = path.segments[-1]
        if tail.endswith('.git'):
            # https://github.com/USER/PACKAGE.git
            path.segments[-1] = tail.removesuffix('.git')
        elif (
            # A README file, as in https://github.com/USER/README.md
            tail.endswith(readme_suffixes)
            # A project sub-page, as in https://github.com/USER/PACKAGE/issues.
            # Truncating to two segments instead would break projects like
            # https://github.com/googleapis/google-cloud-python/blob/main/packages/google-cloud-bigquery-reservation
            or (tail in extra_segments and len(path.segments) > 2)
        ):
            path.segments.pop()
        found.add(str(parsed))
    return sorted(found)
177+
100178

101-
if failures:
102-
print()
103-
raise Exception(f'Python package license files not found: {failures}')
179+
if __name__ == '__main__':
    # Configure logging first so that every message from the run is emitted
    # through the script's logger
    configure_script_logging(log)
    script = FetchLicenses()
    script.main(sys.argv[1:])

0 commit comments

Comments
 (0)