import datetime
import re
from typing import Optional

import requests
import trafilatura
from huggingface_hub import HfApi

__all__ = ["HuggingFace"]


class HuggingFace:
    """Collect a user's liked models, datasets, and spaces on HuggingFace.

    Parameters
    ----------
    token : str, optional
        HuggingFace User Access Token. If not provided, it relies on
        local authentication (huggingface-cli login).
    """

    def __init__(self, token: Optional[str] = None):
        self.token = token
        self.api = HfApi(token=self.token)

    def __call__(self):
        """Return a dict mapping each liked repo's URL to its metadata."""
        data = {}

        try:
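            # With no user argument, list_liked_repos targets the
            # authenticated user (requires a valid token or login).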
            likes = self.api.list_liked_repos()
        except Exception as e:
            print(f"Error fetching likes: {e}")
            return data

        # Process Models
        if hasattr(likes, "models"):
            for model in likes.models:
                repo_id = model.repo_id if hasattr(model, "repo_id") else str(model)
                url = f"https://huggingface.co/{repo_id}"

                # Fetch raw README for summary
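                # Note: model READMEs are assumed to live on the "main"
                # branch here; datasets and spaces resolve their actual
                # default branch below.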
                raw_url = f"https://huggingface.co/{repo_id}/resolve/main/README.md"
                self._process_entry(data, url, raw_url, repo_id, "model")

        # Process Datasets
        if hasattr(likes, "datasets"):
            for dataset in likes.datasets:
                repo_id = (
                    dataset.repo_id if hasattr(dataset, "repo_id") else str(dataset)
                )
                dataset_url = f"https://huggingface.co/datasets/{repo_id}"

                # Specific logic to find the default branch for datasets
                branch = self._get_default_branch(repo_id, "dataset")
                raw_url = f"https://huggingface.co/datasets/{repo_id}/resolve/{branch}/README.md"

                self._process_entry(data, dataset_url, raw_url, repo_id, "dataset")

        # Process Spaces
        if hasattr(likes, "spaces"):
            for space in likes.spaces:
                repo_id = space.repo_id if hasattr(space, "repo_id") else str(space)
                url = f"https://huggingface.co/spaces/{repo_id}"

                # Specific logic to find the default branch for spaces
                branch = self._get_default_branch(repo_id, "space")
                raw_url = f"https://huggingface.co/spaces/{repo_id}/resolve/{branch}/README.md"

                self._process_entry(data, url, raw_url, repo_id, "space")

        return data

    def _get_default_branch(self, repo_id, repo_type):
        """Safely resolve a repo's default branch, falling back to 'main'."""
        try:
            repo_info = self.api.repo_info(repo_id=repo_id, repo_type=repo_type)
            return repo_info.default_branch or "main"
        except Exception:
            return "main"

    def _process_entry(self, data, data_url, summarization_url, title_suffix, tag_type):
        """Fetch a summary for one repo and add its entry to the data dict."""
        print(f"Processing {tag_type}: {title_suffix}")
        summary = self.get_summary(summarization_url)

        data[data_url] = {
            "title": f"🤗 HuggingFace {title_suffix}",
            "tags": ["huggingface", tag_type],
            "summary": summary,
            "date": datetime.datetime.today().strftime("%Y-%m-%d"),
        }

    @staticmethod
    def get_summary(url, num_tokens=50):
        """Fetch a URL and return roughly the first ``num_tokens`` words.

        If the content is a raw Markdown file with YAML frontmatter, the
        metadata block is stripped. Otherwise, text extraction is attempted
        with trafilatura.
        """
        try:
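            # A browser-like User-Agent: some hosts reject requests sent
            # with the default python-requests agent.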
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            }
            response = requests.get(url, headers=headers, timeout=15)
            response.raise_for_status()
            content = response.text

            # Check for YAML frontmatter (starts with ---)
            if content.strip().startswith("---"):
                # Regex to remove the first block enclosed in ---
                # DOTALL allows . to match newlines
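                # e.g. "---\nlicense: mit\n---\n# Model Card" becomes
                # "\n# Model Card" after the substitution.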
                cleaned_text = re.sub(
                    r"^---\n.*?\n---", "", content, count=1, flags=re.DOTALL
                )

                # Strip common Markdown syntax for a cleaner, more readable summary
                cleaned_text = re.sub(r"[#*`]", "", cleaned_text)
                cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip()

            else:
                # Fall back to trafilatura for non-Markdown/HTML content
                core_text = trafilatura.extract(content)
                if not core_text:
                    # If trafilatura fails (e.g. on raw text), use the raw content
                    cleaned_text = re.sub(r"\s+", " ", content).strip()
                else:
                    cleaned_text = re.sub(r"\s+", " ", core_text).strip()

            tokens = cleaned_text.split()
            first_n_tokens = tokens[:num_tokens]

            return " ".join(first_n_tokens)

        except requests.exceptions.RequestException as e:
            print(f"Could not fetch {url}: {e}")
            return ""
        except Exception as e:
            print(f"An error occurred while processing {url}: {e}")
            return ""
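

# Minimal usage sketch (assumes a prior `huggingface-cli login` or a valid
# token; the printed fields mirror the dict built in _process_entry):
if __name__ == "__main__":
    hf = HuggingFace()
    for repo_url, entry in hf().items():
        print(repo_url, "->", entry["summary"][:80])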