Skip to content

Commit 76c4348

Browse files
committed
Refactor alpine mining pipeline for git deployment
Resolves: #774 Signed-off-by: Keshav Priyadarshi <git@keshav.space>
1 parent ff0d374 commit 76c4348

File tree

3 files changed

+61
-106
lines changed

3 files changed

+61
-106
lines changed

minecode_pipelines/pipelines/mine_alpine.py

Lines changed: 15 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -20,36 +20,29 @@
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
2222

23-
from scanpipe.pipelines import Pipeline
24-
from scanpipe.pipes import federatedcode
23+
from minecode_pipelines.pipelines import MineCodeBasePipeline
2524

26-
from minecode_pipelines import pipes
2725
from minecode_pipelines.pipes import alpine
2826

2927

30-
class MineAlpine(Pipeline):
31-
"""
32-
Mine all packageURLs from an alpine index and publish them to
33-
a FederatedCode repo.
34-
"""
28+
class MineAlpine(MineCodeBasePipeline):
29+
"""Mine PackageURLs from alpine index and publish them to FederatedCode."""
3530

3631
@classmethod
3732
def steps(cls):
3833
return (
3934
cls.check_federatedcode_eligibility,
40-
cls.collect_packages_from_alpine,
41-
cls.delete_cloned_repos,
35+
cls.create_federatedcode_working_dir,
36+
cls.fetch_federation_config,
37+
cls.mine_and_publish_alpine_packageurls,
38+
cls.delete_working_dir,
4239
)
4340

44-
def check_federatedcode_eligibility(self):
45-
"""
46-
Check if the project fulfills the following criteria for
47-
pushing the project result to FederatedCode.
48-
"""
49-
federatedcode.check_federatedcode_configured_and_available(logger=self.log)
50-
51-
def collect_packages_from_alpine(self):
52-
self.repos = alpine.collect_packages_from_alpine(logger=self.log)
53-
54-
def delete_cloned_repos(self):
55-
pipes.delete_cloned_repos(repos=self.repos, logger=self.log)
41+
def mine_and_publish_alpine_packageurls(self):
42+
alpine.mine_and_publish_alpine_packageurls(
43+
data_cluster=self.data_cluster,
44+
checked_out_repos=self.checked_out_repos,
45+
working_path=self.working_path,
46+
commit_msg_func=self.commit_message,
47+
logger=self.log,
48+
)

minecode_pipelines/pipes/alpine.py

Lines changed: 45 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -23,21 +23,17 @@
2323
import base64
2424
from shutil import rmtree
2525

26-
from aboutcode import hashid
26+
from aboutcode.pipeline import LoopProgress
2727
from packagedcode.models import PackageData
2828
from packagedcode.models import Party
2929
from packageurl import PackageURL
30-
from scanpipe.pipes import federatedcode
3130
from scanpipe.pipes.fetch import fetch_http
3231
from scanpipe.pipes.scancode import extract_archives
3332

34-
from minecode_pipelines import pipes
35-
from minecode_pipelines import VERSION
33+
from minecode_pipelines.pipelines import _mine_and_publish_packageurls
3634

3735
ALPINE_CHECKPOINT_PATH = "alpine/checkpoints.json"
3836

39-
# We are testing and storing mined packageURLs in one single repo per ecosystem for now
40-
MINECODE_DATA_ALPINE_REPO = "https://github.com/aboutcode-data/minecode-data-alpine-test"
4137

4238
# Number of packages
4339
PACKAGE_BATCH_SIZE = 1000
@@ -530,86 +526,52 @@ def _fetch_index(self, uri):
530526
def get_packages(self, logger=None):
531527
"""Yield Package objects from alpine index"""
532528
for apkindex_url in ALPINE_LINUX_APKINDEX_URLS:
533-
_, subpath = apkindex_url.split("https://dl-cdn.alpinelinux.org/alpine/")
534-
distro, repo, _, _ = subpath.split("/")
535-
index = self._fetch_index(uri=apkindex_url)
536-
extract_archives(location=index.path)
537-
index_location = f"{index.path}-extract/APKINDEX"
538-
with open(index_location, encoding="utf-8") as f:
539-
for pkg in parse_apkindex(f.read()):
540-
pd = build_package(pkg, distro=distro, repo=repo)
541-
current_purl = PackageURL(
542-
type=pd.type,
543-
namespace=pd.namespace,
544-
name=pd.name,
545-
)
546-
yield current_purl, pd
547-
548-
549-
def commit_message(commit_batch, total_commit_batch="many"):
550-
from django.conf import settings
551-
552-
author_name = settings.FEDERATEDCODE_GIT_SERVICE_NAME
553-
author_email = settings.FEDERATEDCODE_GIT_SERVICE_EMAIL
554-
tool_name = "pkg:github/aboutcode-org/scancode.io"
555-
556-
return f"""\
557-
Collect PackageURLs from Alpine ({commit_batch}/{total_commit_batch})
558-
559-
Tool: {tool_name}@v{VERSION}
560-
Reference: https://{settings.ALLOWED_HOSTS[0]}
561-
562-
Signed-off-by: {author_name} <{author_email}>
563-
"""
564-
565-
566-
def collect_packages_from_alpine(files_per_commit=PACKAGE_BATCH_SIZE, logger=None):
567-
# Clone data and config repo
568-
data_repo = federatedcode.clone_repository(
569-
repo_url=MINECODE_DATA_ALPINE_REPO,
529+
self.get_package_from_index(apkindex_url)
530+
531+
def get_package_from_index(self, apkindex_url, logger=None):
532+
_, subpath = apkindex_url.split("https://dl-cdn.alpinelinux.org/alpine/")
533+
distro, repo, _, _ = subpath.split("/")
534+
index = self._fetch_index(uri=apkindex_url)
535+
extract_archives(location=index.path)
536+
index_location = f"{index.path}-extract/APKINDEX"
537+
with open(index_location, encoding="utf-8") as f:
538+
for pkg in parse_apkindex(f.read()):
539+
pd = build_package(pkg, distro=distro, repo=repo)
540+
current_purl = PackageURL(
541+
type=pd.type,
542+
namespace=pd.namespace,
543+
name=pd.name,
544+
)
545+
yield current_purl, [pd.purl]
546+
547+
548+
def mine_and_publish_alpine_packageurls(
549+
data_cluster,
550+
checked_out_repos,
551+
working_path,
552+
commit_msg_func,
553+
logger,
554+
):
555+
"""Mine PackageURLs from each Alpine index and publish them."""
556+
557+
index_count = len(ALPINE_LINUX_APKINDEX_URLS)
558+
progress = LoopProgress(
559+
total_iterations=index_count,
570560
logger=logger,
561+
progress_step=1,
571562
)
572-
config_repo = federatedcode.clone_repository(
573-
repo_url=pipes.MINECODE_PIPELINES_CONFIG_REPO,
574-
logger=logger,
575-
)
576-
if logger:
577-
logger(f"{MINECODE_DATA_ALPINE_REPO} repo cloned at: {data_repo.working_dir}")
578-
logger(f"{pipes.MINECODE_PIPELINES_CONFIG_REPO} repo cloned at: {config_repo.working_dir}")
579563

580-
# download and iterate through alpine indices
564+
logger(f"Mine PackageURL from {index_count:,d} alpine index.")
581565
alpine_collector = AlpineCollector()
582-
files_to_commit = []
583-
commit_batch = 1
584-
for current_purl, package in alpine_collector.get_packages():
585-
# write packageURL to file
586-
package_base_dir = hashid.get_package_base_dir(purl=current_purl)
587-
purl_file = pipes.write_packageurls_to_file(
588-
repo=data_repo,
589-
base_dir=package_base_dir,
590-
packageurls=[package.purl],
591-
append=True,
592-
)
593-
if purl_file not in files_to_commit:
594-
files_to_commit.append(purl_file)
595-
596-
if len(files_to_commit) == files_per_commit:
597-
federatedcode.commit_and_push_changes(
598-
commit_message=commit_message(commit_batch),
599-
repo=data_repo,
600-
files_to_commit=files_to_commit,
601-
logger=logger,
602-
)
603-
files_to_commit.clear()
604-
commit_batch += 1
605-
606-
if files_to_commit:
607-
federatedcode.commit_and_push_changes(
608-
commit_message=commit_message(commit_batch),
609-
repo=data_repo,
610-
files_to_commit=files_to_commit,
566+
for index in progress.iter(ALPINE_LINUX_APKINDEX_URLS):
567+
logger(f"Mine PackageURL from {index} index.")
568+
_mine_and_publish_packageurls(
569+
packageurls=alpine_collector.get_package_from_index(index),
570+
total_package_count=None,
571+
data_cluster=data_cluster,
572+
checked_out_repos=checked_out_repos,
573+
working_path=working_path,
574+
append_purls=True,
575+
commit_msg_func=commit_msg_func,
611576
logger=logger,
612577
)
613-
614-
repos_to_clean = [data_repo, config_repo]
615-
return repos_to_clean

pyproject-minecode_pipelines.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "flot.buildapi"
44

55
[project]
66
name = "minecode_pipelines"
7-
version = "0.0.1b27"
7+
version = "0.0.1b30"
88
description = "A library for mining packageURLs and package metadata from ecosystem repositories."
99
readme = "minecode_pipelines/README.rst"
1010
license = { text = "Apache-2.0" }

0 commit comments

Comments
 (0)