|
23 | 23 | import base64 |
24 | 24 | from shutil import rmtree |
25 | 25 |
|
26 | | -from aboutcode import hashid |
| 26 | +from aboutcode.pipeline import LoopProgress |
27 | 27 | from packagedcode.models import PackageData |
28 | 28 | from packagedcode.models import Party |
29 | 29 | from packageurl import PackageURL |
30 | | -from scanpipe.pipes import federatedcode |
31 | 30 | from scanpipe.pipes.fetch import fetch_http |
32 | 31 | from scanpipe.pipes.scancode import extract_archives |
33 | 32 |
|
34 | | -from minecode_pipelines import pipes |
35 | | -from minecode_pipelines import VERSION |
| 33 | +from minecode_pipelines.pipelines import _mine_and_publish_packageurls |
36 | 34 |
|
37 | 35 | ALPINE_CHECKPOINT_PATH = "alpine/checkpoints.json" |
38 | 36 |
|
39 | | -# We are testing and storing mined packageURLs in one single repo per ecosystem for now |
40 | | -MINECODE_DATA_ALPINE_REPO = "https://github.com/aboutcode-data/minecode-data-alpine-test" |
41 | 37 |
|
42 | 38 | # Number of packages |
43 | 39 | PACKAGE_BATCH_SIZE = 1000 |
@@ -530,86 +526,52 @@ def _fetch_index(self, uri): |
def get_packages(self, logger=None):
    """
    Yield the items produced for every Alpine APKINDEX, one index after
    the other.

    Walk ``ALPINE_LINUX_APKINDEX_URLS`` in order and re-yield each item
    generated by ``get_package_from_index()`` for that URL, that is a
    two-tuple of a PackageURL and a one-item list holding the package purl.

    ``logger`` is accepted for interface compatibility and is not used here.
    """
    for apkindex_url in ALPINE_LINUX_APKINDEX_URLS:
        # ``get_package_from_index`` is a generator function: calling it as
        # a bare statement only creates a generator object and discards it,
        # so nothing would be fetched or yielded. Delegate with
        # ``yield from`` so that this method is itself a generator.
        yield from self.get_package_from_index(apkindex_url)
| 530 | + |
def get_package_from_index(self, apkindex_url, logger=None):
    """
    Yield one ``(base_purl, [purl])`` two-tuple for each package entry
    parsed from the APKINDEX found at ``apkindex_url``.

    ``base_purl`` is a PackageURL built only from the type, namespace and
    name of the package data; the second item is a one-element list holding
    the ``purl`` attribute of that same package data.

    ``apkindex_url`` must contain the Alpine CDN prefix exactly once,
    followed by exactly four "/"-separated segments; the first two are used
    as the distro and the repo. Any other shape raises a ValueError on
    unpacking.

    ``logger`` is accepted but not used by this method.
    """
    cdn_prefix = "https://dl-cdn.alpinelinux.org/alpine/"
    _, url_tail = apkindex_url.split(cdn_prefix)
    distro, repo, _, _ = url_tail.split("/")

    fetched_index = self._fetch_index(uri=apkindex_url)
    extract_archives(location=fetched_index.path)

    # The extracted APKINDEX is read from the "<archive path>-extract"
    # directory next to the fetched archive.
    apkindex_location = f"{fetched_index.path}-extract/APKINDEX"
    with open(apkindex_location, encoding="utf-8") as apkindex_file:
        apkindex_text = apkindex_file.read()
        for entry in parse_apkindex(apkindex_text):
            package_data = build_package(entry, distro=distro, repo=repo)
            base_purl = PackageURL(
                type=package_data.type,
                namespace=package_data.namespace,
                name=package_data.name,
            )
            yield base_purl, [package_data.purl]
| 546 | + |
| 547 | + |
def mine_and_publish_alpine_packageurls(
    data_cluster,
    checked_out_repos,
    working_path,
    commit_msg_func,
    logger,
):
    """
    Mine the PackageURLs of every Alpine APKINDEX and hand them over for
    publishing, one index at a time.

    For each URL of ``ALPINE_LINUX_APKINDEX_URLS``, the generator returned by
    ``AlpineCollector.get_package_from_index()`` is passed to
    ``_mine_and_publish_packageurls()`` together with ``data_cluster``,
    ``checked_out_repos``, ``working_path``, ``commit_msg_func`` and
    ``logger``, with ``append_purls=True`` and ``total_package_count=None``.

    ``logger`` is called directly with message strings, so it must be a
    callable. This function returns nothing.
    """
    apkindex_total = len(ALPINE_LINUX_APKINDEX_URLS)
    loop_progress = LoopProgress(
        total_iterations=apkindex_total,
        logger=logger,
        progress_step=1,
    )

    logger(f"Mine PackageURL from {apkindex_total:,d} alpine index.")
    collector = AlpineCollector()

    # Arguments that are identical for every index.
    shared_options = {
        "total_package_count": None,
        "data_cluster": data_cluster,
        "checked_out_repos": checked_out_repos,
        "working_path": working_path,
        "append_purls": True,
        "commit_msg_func": commit_msg_func,
        "logger": logger,
    }

    for apkindex_url in loop_progress.iter(ALPINE_LINUX_APKINDEX_URLS):
        logger(f"Mine PackageURL from {apkindex_url} index.")
        _mine_and_publish_packageurls(
            packageurls=collector.get_package_from_index(apkindex_url),
            **shared_options,
        )
613 | | - |
614 | | - repos_to_clean = [data_repo, config_repo] |
615 | | - return repos_to_clean |
0 commit comments