Skip to content

Commit 6d3bda0

Browse files
committed
Add HuggingFace likes
1 parent bcda31f commit 6d3bda0

File tree

13 files changed

+170
-19
lines changed

13 files changed

+170
-19
lines changed

.github/workflows/database.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ on:
55
branches:
66
- main
77
schedule:
8-
- cron: '0 1 * * *'
8+
- cron: "0 1 * * *"
99

1010
jobs:
1111
build:
@@ -23,7 +23,7 @@ jobs:
2323
- name: setup python
2424
uses: actions/setup-python@v2
2525
with:
26-
python-version: '3.10'
26+
python-version: "3.10"
2727

2828
- name: install python packages
2929
run: |
@@ -40,6 +40,7 @@ jobs:
4040
HACKERNEWS_USERNAME: ${{ secrets.HACKERNEWS_USERNAME }}
4141
ZOTERO_API_KEY: ${{ secrets.ZOTERO_API_KEY }}
4242
ZOTERO_LIBRARY_ID: ${{ secrets.ZOTERO_LIBRARY_ID }}
43+
HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }}
4344
run: python run.py
4445

4546
- name: commit files

knowledge_database/hackernews/hackernews.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,7 @@
11
import datetime
2-
import json
32
import re
43

54
import requests
6-
import tqdm
75
import trafilatura
86
from bs4 import BeautifulSoup
97

@@ -43,7 +41,6 @@ def __call__(self):
4341
data = {}
4442

4543
with requests.Session() as session:
46-
4744
p = session.post(
4845
"https://news.ycombinator.com/login?goto=news",
4946
data={"acct": self.username, "pw": self.password},
@@ -59,7 +56,6 @@ def __call__(self):
5956
soup = BeautifulSoup(html, "html.parser")
6057

6158
for entry in soup.find_all("td", class_="title"):
62-
6359
record = entry.find("a")
6460
if record is None:
6561
continue
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from .huggingface import HuggingFace
2+
3+
__all__ = ["HuggingFace"]
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
import datetime
2+
import re
3+
4+
import requests
5+
import trafilatura
6+
from huggingface_hub import HfApi
7+
8+
__all__ = ["HuggingFace"]
9+
10+
11+
class HuggingFace:
    """HuggingFace liked models, datasets, and spaces.

    Parameters
    ----------
    token : str, optional
        HuggingFace User Access Token. If not provided, it relies on
        local authentication (huggingface-cli login).
    """

    def __init__(self, token: str = None):
        self.token = token
        self.api = HfApi(token=self.token)

    def __call__(self):
        """Get liked content on HuggingFace.

        Returns
        -------
        dict
            Maps each liked repository's URL to a record holding title,
            tags, summary and date. Empty when likes cannot be fetched.
        """
        data = {}

        try:
            likes = self.api.list_liked_repos()
        except Exception as e:
            # Best-effort: a missing/expired token must not break the pipeline.
            print(f"Error fetching likes: {e}")
            return data

        # Process Models
        for model in getattr(likes, "models", []):
            repo_id = model.repo_id if hasattr(model, "repo_id") else str(model)
            url = f"https://huggingface.co/{repo_id}"

            # Resolve the default branch instead of hard-coding "main":
            # model repos whose default branch differs would otherwise 404
            # on the raw README and lose their summary.
            branch = self._get_default_branch(repo_id, "model")
            raw_url = f"https://huggingface.co/{repo_id}/resolve/{branch}/README.md"
            self._process_entry(data, url, raw_url, repo_id, "model")

        # Process Datasets
        for dataset in getattr(likes, "datasets", []):
            repo_id = (
                dataset.repo_id if hasattr(dataset, "repo_id") else str(dataset)
            )
            dataset_url = f"https://huggingface.co/datasets/{repo_id}"

            # specific logic to find branch for datasets
            branch = self._get_default_branch(repo_id, "dataset")
            raw_url = f"https://huggingface.co/datasets/{repo_id}/resolve/{branch}/README.md"

            self._process_entry(data, dataset_url, raw_url, repo_id, "dataset")

        # Process Spaces
        for space in getattr(likes, "spaces", []):
            repo_id = space.repo_id if hasattr(space, "repo_id") else str(space)
            url = f"https://huggingface.co/spaces/{repo_id}"

            # specific logic to find branch for spaces
            branch = self._get_default_branch(repo_id, "space")
            raw_url = f"https://huggingface.co/spaces/{repo_id}/resolve/{branch}/README.md"

            self._process_entry(data, url, raw_url, repo_id, "space")

        return data

    def _get_default_branch(self, repo_id, repo_type):
        """Helper to safely get default branch.

        Falls back to "main" when the Hub cannot be reached or the repo
        info does not expose a default branch.
        """
        try:
            repo_info = self.api.repo_info(repo_id=repo_id, repo_type=repo_type)
            return repo_info.default_branch if repo_info.default_branch else "main"
        except Exception:
            return "main"

    def _process_entry(self, data, data_url, summarization_url, title_suffix, tag_type):
        """Helper to fetch summary and populate data dict.

        Parameters
        ----------
        data : dict
            Accumulator mutated in place; keyed by `data_url`.
        data_url : str
            Canonical Hub URL of the repository (record key).
        summarization_url : str
            Raw README URL used to build the summary.
        title_suffix : str
            Repository id appended to the record title.
        tag_type : str
            One of "model", "dataset", "space"; stored as a tag.
        """
        print(f"Processing {tag_type}: {title_suffix}")
        summary = self.get_summary(summarization_url)

        data[data_url] = {
            "title": f"🤗 HuggingFace {title_suffix}",
            "tags": ["huggingface", tag_type],
            "summary": summary,
            "date": datetime.datetime.today().strftime("%Y-%m-%d"),
        }

    @staticmethod
    def get_summary(url, num_tokens=50):
        """
        Fetches the content from a URL.
        If it's a raw Markdown file with YAML frontmatter, it strips the metadata.
        Otherwise, it attempts to extract text using trafilatura.

        Parameters
        ----------
        url : str
            URL to fetch (typically a raw README.md).
        num_tokens : int, optional
            Maximum number of whitespace-separated tokens kept.

        Returns
        -------
        str
            Cleaned summary truncated to `num_tokens` tokens; empty string
            on any fetch or processing error (best-effort by design).
        """
        try:
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            }
            response = requests.get(url, headers=headers, timeout=15)
            response.raise_for_status()
            content = response.text

            # Check for YAML frontmatter (starts with ---)
            if content.strip().startswith("---"):
                # Regex to remove the first block enclosed in ---
                # DOTALL allows . to match newlines
                cleaned_text = re.sub(
                    r"^---\n.*?\n---", "", content, count=1, flags=re.DOTALL
                )

                # Since it is markdown, we might want to strip common markdown syntax
                # for a cleaner summary (optional, but makes it readable)
                cleaned_text = re.sub(r"[#*`]", "", cleaned_text)
                cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip()

            else:
                # Fallback to trafilatura for non-markdown/HTML content
                core_text = trafilatura.extract(content)
                if not core_text:
                    # If trafilatura fails (e.g. on raw text), use raw content
                    cleaned_text = re.sub(r"\s+", " ", content).strip()
                else:
                    cleaned_text = re.sub(r"\s+", " ", core_text).strip()

            tokens = cleaned_text.split()
            first_n_tokens = tokens[:num_tokens]

            return " ".join(first_n_tokens)

        except requests.exceptions.RequestException as e:
            print(f"Could not fetch {url}: {e}")
            return ""
        except Exception as e:
            print(f"An error occurred while processing {url}: {e}")
            return ""

knowledge_database/reddit/reddit.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,3 @@
1-
import requests
2-
from bs4 import BeautifulSoup
3-
41
__all__ = ["Reddit"]
52

63

knowledge_database/semanlink/semanlink.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,9 @@ def __init__(self, urls: typing.List):
3535
self.urls = urls
3636

3737
def __call__(self):
38-
3938
graph = rdflib.Graph()
4039
triples = []
4140
for url in self.urls:
42-
4341
turtle = graph.parse(url, format="turtle")
4442

4543
triples += [
@@ -55,7 +53,6 @@ def __call__(self):
5553
clean = collections.defaultdict(dict)
5654

5755
for _, metadata in data.items():
58-
5956
valid = True
6057

6158
for relation in [
@@ -66,7 +63,6 @@ def __call__(self):
6663
"arxiv_author",
6764
]:
6865
if relation not in metadata:
69-
7066
valid = False
7167
break
7268

knowledge_database/tags/tags.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,8 @@ def get_tags_triples(data: typing.List, excluded_tags=None):
1717
seen = collections.defaultdict(dict)
1818

1919
for _, document in data.items():
20-
2120
tags = document["tags"] + document["extra-tags"]
2221
for head, tail in itertools.combinations(tags, 2):
23-
2422
if head in excluded_tags or tail in excluded_tags:
2523
continue
2624

knowledge_database/twitter/twitter.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,6 @@ def __call__(self, limit: int = 100):
4848
next_token = ""
4949

5050
for _ in range(limit):
51-
5251
tweets = requests.get(
5352
self.url + next_token, headers={"Authorization": f"Bearer {self.token}"}
5453
).json()

knowledge_database/zotero/zotero.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,6 @@ def __call__(self, limit: int = 10000):
4242
data = {}
4343

4444
for idx, document in enumerate(self.client.top(limit=limit)):
45-
4645
date = datetime.datetime.strptime(
4746
document["data"]["dateAdded"], "%Y-%m-%dT%H:%M:%SZ"
4847
).strftime("%Y-%m-%d")

readme.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
<img src="img/demo.gif" alt="Demonstration GIF" style="width:100%; border-radius:10px; box-shadow:0 4px 8px rgba(0,0,0,0.1);">
1313
</p>
1414

15-
**Knowledge** is a web application that automatically transforms the digital footprint into a personal search engine. It fetches content you interact with from various platforms—**GitHub**, **HackerNews**, and **Zotero**and organizes it into a navigable knowledge graph.
15+
**Knowledge** is a web application that automatically transforms the digital footprint into a personal search engine. It fetches content you interact with from various platforms—**GitHub**, **HackerNews**, **Zotero**, and **HuggingFace** likes—and organizes it into a navigable knowledge graph.
1616

1717
---
1818

@@ -36,6 +36,7 @@ A GitHub Actions workflow runs twice a day to perform the following tasks:
3636
- GitHub Stars
3737
- HackerNews Upvotes
3838
- Zotero Records
39+
- HuggingFace Likes
3940
2. **Processes and Stores Data** in the `database/` directory:
4041
- `database.json`: Contains all the raw records.
4142
- `triples.json`: Stores the knowledge graph data (topics and relationships).

0 commit comments

Comments
 (0)