2323import base64
2424from shutil import rmtree
2525
26- from aboutcode import hashid
26+ from aboutcode . pipeline import LoopProgress
2727from packagedcode .models import PackageData
2828from packagedcode .models import Party
2929from packageurl import PackageURL
30- from scanpipe .pipes import federatedcode
3130from scanpipe .pipes .fetch import fetch_http
3231from scanpipe .pipes .scancode import extract_archives
3332
34- from minecode_pipelines import pipes
3533from minecode_pipelines import VERSION
34+ from minecode_pipelines .pipelines import _mine_and_publish_packageurls
3635
3736ALPINE_CHECKPOINT_PATH = "alpine/checkpoints.json"
3837
39- # We are testing and storing mined packageURLs in one single repo per ecosystem for now
40- MINECODE_DATA_ALPINE_REPO = "https://github.com/aboutcode-data/minecode-data-alpine-test"
4138
4239# Number of packages
4340PACKAGE_BATCH_SIZE = 1000
@@ -530,20 +527,23 @@ def _fetch_index(self, uri):
530527 def get_packages (self , logger = None ):
531528 """Yield Package objects from alpine index"""
532529 for apkindex_url in ALPINE_LINUX_APKINDEX_URLS :
533- _ , subpath = apkindex_url .split ("https://dl-cdn.alpinelinux.org/alpine/" )
534- distro , repo , _ , _ = subpath .split ("/" )
535- index = self ._fetch_index (uri = apkindex_url )
536- extract_archives (location = index .path )
537- index_location = f"{ index .path } -extract/APKINDEX"
538- with open (index_location , encoding = "utf-8" ) as f :
539- for pkg in parse_apkindex (f .read ()):
540- pd = build_package (pkg , distro = distro , repo = repo )
541- current_purl = PackageURL (
542- type = pd .type ,
543- namespace = pd .namespace ,
544- name = pd .name ,
545- )
546- yield current_purl , pd
530+ self .get_package_from_index (apkindex_url )
531+
532+ def get_package_from_index (self , apkindex_url , logger = None ):
533+ _ , subpath = apkindex_url .split ("https://dl-cdn.alpinelinux.org/alpine/" )
534+ distro , repo , _ , _ = subpath .split ("/" )
535+ index = self ._fetch_index (uri = apkindex_url )
536+ extract_archives (location = index .path )
537+ index_location = f"{ index .path } -extract/APKINDEX"
538+ with open (index_location , encoding = "utf-8" ) as f :
539+ for pkg in parse_apkindex (f .read ()):
540+ pd = build_package (pkg , distro = distro , repo = repo )
541+ current_purl = PackageURL (
542+ type = pd .type ,
543+ namespace = pd .namespace ,
544+ name = pd .name ,
545+ )
546+ yield current_purl , [pd .purl ]
547547
548548
549549def commit_message (commit_batch , total_commit_batch = "many" ):
@@ -557,59 +557,32 @@ def commit_message(commit_batch, total_commit_batch="many"):
557557 Collect PackageURLs from Alpine ({ commit_batch } /{ total_commit_batch } )
558558
559559 Tool: { tool_name } @v{ VERSION }
560- Reference: https://{ settings .ALLOWED_HOSTS [0 ]}
561560
562561 Signed-off-by: { author_name } <{ author_email } >
563562 """
564563
565564
def mine_and_publish_alpine_packageurls(data_cluster, checked_out_repos, working_path, logger):
    """
    Mine PackageURLs from every Alpine APKINDEX and hand them off for publishing.

    For each URL in ``ALPINE_LINUX_APKINDEX_URLS`` the generator returned by
    ``AlpineCollector.get_package_from_index`` is passed to
    ``_mine_and_publish_packageurls`` together with ``data_cluster``,
    ``checked_out_repos``, ``working_path``, ``commit_message`` as the commit
    message function and ``append_purls=True``. No total package count is
    supplied. Progress over the index URLs is reported through ``logger``,
    which is called directly and therefore must be a callable.

    This function returns nothing.
    """
    total_indexes = len(ALPINE_LINUX_APKINDEX_URLS)
    tracker = LoopProgress(
        total_iterations=total_indexes,
        logger=logger,
        progress_step=1,
    )

    logger(f"Mine PackageURL from {total_indexes:,d} alpine index.")
    collector = AlpineCollector()

    # Arguments that are identical for every index processed below.
    shared_options = {
        "total_package_count": None,
        "data_cluster": data_cluster,
        "checked_out_repos": checked_out_repos,
        "working_path": working_path,
        "append_purls": True,
        "commit_msg_func": commit_message,
        "logger": logger,
    }

    for apkindex_url in tracker.iter(ALPINE_LINUX_APKINDEX_URLS):
        logger(f"Mine PackageURL from {apkindex_url} index.")
        _mine_and_publish_packageurls(
            packageurls=collector.get_package_from_index(apkindex_url),
            **shared_options,
        )
0 commit comments