Skip to content

Commit ca33a89

Browse files
committed
Refactor alpine mining pipeline for git deployment
Resolves: #774
Signed-off-by: Keshav Priyadarshi <git@keshav.space>
1 parent ff0d374 commit ca33a89

File tree

3 files changed

+52
-87
lines changed

3 files changed

+52
-87
lines changed

minecode_pipelines/pipelines/mine_alpine.py

Lines changed: 14 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -20,36 +20,28 @@
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
2222

23-
from scanpipe.pipelines import Pipeline
24-
from scanpipe.pipes import federatedcode
23+
from minecode_pipelines.pipelines import MineCodeBasePipeline
2524

26-
from minecode_pipelines import pipes
2725
from minecode_pipelines.pipes import alpine
2826

2927

30-
class MineAlpine(Pipeline):
31-
"""
32-
Mine all packageURLs from an alpine index and publish them to
33-
a FederatedCode repo.
34-
"""
28+
class MineAlpine(MineCodeBasePipeline):
29+
"""Mine PackageURLs from alpine index and publish them to FederatedCode."""
3530

3631
@classmethod
3732
def steps(cls):
3833
return (
3934
cls.check_federatedcode_eligibility,
40-
cls.collect_packages_from_alpine,
41-
cls.delete_cloned_repos,
35+
cls.create_federatedcode_working_dir,
36+
cls.fetch_federation_config,
37+
cls.mine_and_publish_alpine_packageurls,
38+
cls.delete_working_dir,
4239
)
4340

44-
def check_federatedcode_eligibility(self):
45-
"""
46-
Check if the project fulfills the following criteria for
47-
pushing the project result to FederatedCode.
48-
"""
49-
federatedcode.check_federatedcode_configured_and_available(logger=self.log)
50-
51-
def collect_packages_from_alpine(self):
52-
self.repos = alpine.collect_packages_from_alpine(logger=self.log)
53-
54-
def delete_cloned_repos(self):
55-
pipes.delete_cloned_repos(repos=self.repos, logger=self.log)
41+
def mine_and_publish_alpine_packageurls(self):
42+
alpine.mine_and_publish_alpine_packageurls(
43+
data_cluster=self.data_cluster,
44+
checked_out_repos=self.checked_out_repos,
45+
working_path=self.working_path,
46+
logger=self.log,
47+
)

minecode_pipelines/pipes/alpine.py

Lines changed: 37 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -23,21 +23,18 @@
2323
import base64
2424
from shutil import rmtree
2525

26-
from aboutcode import hashid
26+
from aboutcode.pipeline import LoopProgress
2727
from packagedcode.models import PackageData
2828
from packagedcode.models import Party
2929
from packageurl import PackageURL
30-
from scanpipe.pipes import federatedcode
3130
from scanpipe.pipes.fetch import fetch_http
3231
from scanpipe.pipes.scancode import extract_archives
3332

34-
from minecode_pipelines import pipes
3533
from minecode_pipelines import VERSION
34+
from minecode_pipelines.pipelines import _mine_and_publish_packageurls
3635

3736
ALPINE_CHECKPOINT_PATH = "alpine/checkpoints.json"
3837

39-
# We are testing and storing mined packageURLs in one single repo per ecosystem for now
40-
MINECODE_DATA_ALPINE_REPO = "https://github.com/aboutcode-data/minecode-data-alpine-test"
4138

4239
# Number of packages
4340
PACKAGE_BATCH_SIZE = 1000
@@ -530,20 +527,23 @@ def _fetch_index(self, uri):
530527
def get_packages(self, logger=None):
531528
"""Yield Package objects from alpine index"""
532529
for apkindex_url in ALPINE_LINUX_APKINDEX_URLS:
533-
_, subpath = apkindex_url.split("https://dl-cdn.alpinelinux.org/alpine/")
534-
distro, repo, _, _ = subpath.split("/")
535-
index = self._fetch_index(uri=apkindex_url)
536-
extract_archives(location=index.path)
537-
index_location = f"{index.path}-extract/APKINDEX"
538-
with open(index_location, encoding="utf-8") as f:
539-
for pkg in parse_apkindex(f.read()):
540-
pd = build_package(pkg, distro=distro, repo=repo)
541-
current_purl = PackageURL(
542-
type=pd.type,
543-
namespace=pd.namespace,
544-
name=pd.name,
545-
)
546-
yield current_purl, pd
530+
self.get_package_from_index(apkindex_url)
531+
532+
def get_package_from_index(self, apkindex_url, logger=None):
533+
_, subpath = apkindex_url.split("https://dl-cdn.alpinelinux.org/alpine/")
534+
distro, repo, _, _ = subpath.split("/")
535+
index = self._fetch_index(uri=apkindex_url)
536+
extract_archives(location=index.path)
537+
index_location = f"{index.path}-extract/APKINDEX"
538+
with open(index_location, encoding="utf-8") as f:
539+
for pkg in parse_apkindex(f.read()):
540+
pd = build_package(pkg, distro=distro, repo=repo)
541+
current_purl = PackageURL(
542+
type=pd.type,
543+
namespace=pd.namespace,
544+
name=pd.name,
545+
)
546+
yield current_purl, [pd.purl]
547547

548548

549549
def commit_message(commit_batch, total_commit_batch="many"):
@@ -557,59 +557,32 @@ def commit_message(commit_batch, total_commit_batch="many"):
557557
Collect PackageURLs from Alpine ({commit_batch}/{total_commit_batch})
558558
559559
Tool: {tool_name}@v{VERSION}
560-
Reference: https://{settings.ALLOWED_HOSTS[0]}
561560
562561
Signed-off-by: {author_name} <{author_email}>
563562
"""
564563

565564

566-
def collect_packages_from_alpine(files_per_commit=PACKAGE_BATCH_SIZE, logger=None):
567-
# Clone data and config repo
568-
data_repo = federatedcode.clone_repository(
569-
repo_url=MINECODE_DATA_ALPINE_REPO,
570-
logger=logger,
571-
)
572-
config_repo = federatedcode.clone_repository(
573-
repo_url=pipes.MINECODE_PIPELINES_CONFIG_REPO,
565+
def mine_and_publish_alpine_packageurls(data_cluster, checked_out_repos, working_path, logger):
566+
"""Yield PackageURLs from Alpine index."""
567+
568+
index_count = len(ALPINE_LINUX_APKINDEX_URLS)
569+
progress = LoopProgress(
570+
total_iterations=index_count,
574571
logger=logger,
572+
progress_step=1,
575573
)
576-
if logger:
577-
logger(f"{MINECODE_DATA_ALPINE_REPO} repo cloned at: {data_repo.working_dir}")
578-
logger(f"{pipes.MINECODE_PIPELINES_CONFIG_REPO} repo cloned at: {config_repo.working_dir}")
579574

580-
# download and iterate through alpine indices
575+
logger(f"Mine PackageURL from {index_count:,d} alpine index.")
581576
alpine_collector = AlpineCollector()
582-
files_to_commit = []
583-
commit_batch = 1
584-
for current_purl, package in alpine_collector.get_packages():
585-
# write packageURL to file
586-
package_base_dir = hashid.get_package_base_dir(purl=current_purl)
587-
purl_file = pipes.write_packageurls_to_file(
588-
repo=data_repo,
589-
base_dir=package_base_dir,
590-
packageurls=[package.purl],
591-
append=True,
592-
)
593-
if purl_file not in files_to_commit:
594-
files_to_commit.append(purl_file)
595-
596-
if len(files_to_commit) == files_per_commit:
597-
federatedcode.commit_and_push_changes(
598-
commit_message=commit_message(commit_batch),
599-
repo=data_repo,
600-
files_to_commit=files_to_commit,
601-
logger=logger,
602-
)
603-
files_to_commit.clear()
604-
commit_batch += 1
605-
606-
if files_to_commit:
607-
federatedcode.commit_and_push_changes(
608-
commit_message=commit_message(commit_batch),
609-
repo=data_repo,
610-
files_to_commit=files_to_commit,
577+
for index in progress.iter(ALPINE_LINUX_APKINDEX_URLS):
578+
logger(f"Mine PackageURL from {index} index.")
579+
_mine_and_publish_packageurls(
580+
packageurls=alpine_collector.get_package_from_index(index),
581+
total_package_count=None,
582+
data_cluster=data_cluster,
583+
checked_out_repos=checked_out_repos,
584+
working_path=working_path,
585+
append_purls=True,
586+
commit_msg_func=commit_message,
611587
logger=logger,
612588
)
613-
614-
repos_to_clean = [data_repo, config_repo]
615-
return repos_to_clean

pyproject-minecode_pipelines.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "flot.buildapi"
44

55
[project]
66
name = "minecode_pipelines"
7-
version = "0.0.1b27"
7+
version = "0.0.1b29"
88
description = "A library for mining packageURLs and package metadata from ecosystem repositories."
99
readme = "minecode_pipelines/README.rst"
1010
license = { text = "Apache-2.0" }

0 commit comments

Comments
 (0)