wmde
diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
Lines changed: 23 additions & 0 deletions
diff --git a/README.md b/README.md
Lines changed: 31 additions & 65 deletions
diff --git a/main.py b/main.py
Lines changed: 26 additions & 19 deletions
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 25 additions & 0 deletions
diff --git a/src/Normalizer/JSONNormalizer.py b/src/Normalizer/JSONNormalizer.py
Lines changed: 27 additions & 3 deletions
diff --git a/src/Normalizer/TTLNormalizer.py b/src/Normalizer/TTLNormalizer.py
Lines changed: 34 additions & 11 deletions
diff --git a/src/Normalizer/__init__.py b/src/Normalizer/__init__.py
Lines changed: 5 additions & 1 deletion
@@ -0,0 +1,23 @@
+name: Pylint
+
+on: [push]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.8", "3.9", "3.10"]
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v3
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install pylint
+    - name: Analysing the code with pylint
+      run: |
+        pylint $(git ls-files '*.py')
@@ -1,81 +1,47 @@
 # Wikidata Textifier
 
-**Wikidata Textifier** is an API that transforms Wikidata items into compact format for use in LLMs and GenAI applications. It resolves missing labels of properties and claim values by querying the Wikidata Action API, making it efficient and suitable for AI pipelines.
+**Wikidata Textifier** is an API that transforms Wikidata entities into compact outputs for LLM and GenAI use cases.
+It resolves missing labels for properties and claim values using the Wikidata Action API and caches labels to reduce repeated lookups.
 
-🔗 Live API: [https://wd-textify.toolforge.org/](https://wd-textify.toolforge.org/)
+- Live API: [wd-textify.wmcloud.org](https://wd-textify.wmcloud.org/)
+- API Docs: [wd-textify.wmcloud.org/docs](https://wd-textify.wmcloud.org/docs)
 
----
+## Features
 
-## Functionalities
+- Textify Wikidata entities as `json`, `text`, or `triplet`.
+- Resolve labels for linked entities and properties.
+- Cache labels in MariaDB for faster repeated requests.
+- Support multilingual output with fallback language support.
+- Avoid SPARQL and use Wikidata Action API / EntityData endpoints.
 
-- **Textifies** any Wikidata item into a readable or JSON format suitable for LLMs.
-- **Resolves all labels**, including those missing when querying the Wikidata API.
-- **Caches labels** for 90 days to boost performance and reduce API load.
-- **Avoids SPARQL** and uses the Wikidata Action API for better efficiency and compatibility.
-- **Hosted on Toolforge**: [https://wd-textify.toolforge.org/](https://wd-textify.toolforge.org/)
+## Output Formats
 
----
+- `json`: Structured representation with claims (and optionally qualifiers/references).
+- `text`: Readable summary including label, description, aliases, and attributes.
+- `triplet`: Triplet-style lines with labels and IDs for graph-style traversal.
 
-## Formats
-
-- **Text**: A textual representation or summary of the Wikidata item, including its label, description, aliases, and claims. Useful for helping LLMs understand what the item represents.
-- **Triplet**: Outputs each triplet as a structured line, including labels and IDs, but omits descriptions and aliases. Ideal for agentic LLMs to traverse and explore Wikidata.
-- **JSON**: A structured and compact representation of the full item, suitable for custom formats.
-
----
-
-## API Usage
+## API
 
 ### `GET /`
 
-#### Query Parameters
-
-| Name           | Type    | Required | Description                                                                 |
-|----------------|---------|----------|-----------------------------------------------------------------------------|
-| `id`           | string  | Yes      | Wikidata item ID (e.g., `Q42`)                                              |
-| `lang`         | string  | No       | Language code for labels (default: `en`)                                   |
-| `format`         | string    | No       | The format of the response, either 'json', 'text', or 'triplet' (default: `json`) |
-| `external_ids` | bool    | No       | Whether to include external IDs in the output (default: `true`)            |
-| `all_ranks` | bool    | No       | If false, returns ranked preferred statements, falling back to normal when unavailable (default: `false`)            |
-| `references` | bool    | No       | Whether to include references (default: `false`)            |
-| `fallback_lang` | string    | No       | Fallback language code if the preferred language is not available (default: `en`)            |
-
----
-
-## Deploy to Toolforge
-
-1. Shell into the Toolforge system:
-
-```bash
-ssh [UNIX shell username]@login.toolforge.org
-```
-
-2. Switch to tool user account:
-
-```bash
-become wd-textify
-```
-
-3. Build from Git:
-
-```bash
-toolforge build start https://github.com/philippesaade-wmde/WikidataTextifier.git
-```
+#### Query parameters
 
-4. Start the web service:
+| Name | Type | Required | Description |
+|---|---|---|---|
+| `id` | string | Yes | Comma-separated Wikidata IDs (for example: `Q42` or `Q42,Q2`). |
+| `pid` | string | No | Comma-separated property IDs to filter claims (for example: `P31,P279`). |
+| `lang` | string | No | Preferred language code (default: `en`). |
+| `fallback_lang` | string | No | Fallback language code (default: `en`). |
+| `format` | string | No | Output format: `json`, `text`, or `triplet` (default: `json`). |
+| `external_ids` | bool | No | Include `external-id` datatype claims (default: `true`). |
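+| `all_ranks` | bool | No | Include statements of every rank (preferred, normal, deprecated); when `false`, return preferred statements and fall back to normal-rank ones when none are preferred (default: `false`). |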
+| `qualifiers` | bool | No | Include qualifiers in claim values (default: `true`). |
+| `references` | bool | No | Include references in claim values (default: `false`). |
 
-```bash
-webservice buildservice start --mount all
-```
-
-5. Debugging the web service:
-
-Read the logs:
-```bash
-webservice logs
-```
+#### Example requests
 
-Open the service shell:
 ```bash
-webservice shell
+curl "https://wd-textify.wmcloud.org/?id=Q42"
+curl "https://wd-textify.wmcloud.org/?id=Q42&format=text&lang=en"
+curl "https://wd-textify.wmcloud.org/?id=Q42,Q2&pid=P31,P279&format=triplet"
 ```
@@ -1,14 +1,16 @@
-from fastapi import FastAPI, HTTPException, Query, Request
-from fastapi.middleware.cors import CORSMiddleware
-from fastapi import BackgroundTasks
+"""FastAPI application that exposes Wikidata textification endpoints."""
+
+import os
+import time
 import traceback
+
 import requests
-import time
-import os
+from fastapi import BackgroundTasks, FastAPI, HTTPException, Query, Request
+from fastapi.middleware.cors import CORSMiddleware
 
-from src.Normalizer import TTLNormalizer, JSONNormalizer
-from src.WikidataLabel import WikidataLabel, LazyLabelFactory
 from src import utils
+from src.Normalizer import JSONNormalizer, TTLNormalizer
+from src.WikidataLabel import LazyLabelFactory, WikidataLabel
 
 # Start Fastapi app
 app = FastAPI(
@@ -34,6 +36,7 @@
 
 @app.on_event("startup")
 async def startup():
+    """Initialize database resources required by the API."""
     WikidataLabel.initialize_database()
 
 @app.get(
@@ -71,22 +74,26 @@ async def get_textified_wd(
     qualifiers: bool = True,
     fallback_lang: str = 'en'
 ):
-    """
-    Retrieve a Wikidata item with all labels or textual representations for an LLM.
+    """Return normalized Wikidata entities in JSON, text, or triplet format.
 
     Args:
-        id (str): The Wikidata item ID (e.g., "Q42").
-        pid (str): Comma-separated list of property IDs to filter claims (e.g., "P31,P279").
-        format (str): The format of the response, either 'json', 'text', or 'triplet'.
-        lang (str): The language code for labels (default is 'en').
-        external_ids (bool): If True, includes external IDs in the response.
-        all_ranks (bool): If True, includes statements of all ranks (preferred, normal, deprecated).
-        references (bool): If True, includes references in the response. (only available in JSON format)
-        qualifiers (bool): If True, includes qualifiers in the response.
-        fallback_lang (str): The fallback language code if the preferred language is not available.
+        request (Request): Incoming request object (currently unused).
+        background_tasks (BackgroundTasks): Background task queue for periodic cache cleanup.
+        id (str): Comma-separated entity IDs (for example, ``"Q42,Q2"``).
+        pid (str): Optional comma-separated property IDs used to filter claims.
+        lang (str): Preferred language code for labels and formatted values.
+        format (str): Output format: ``"json"``, ``"text"``, or ``"triplet"``.
+        external_ids (bool): Whether to include claims with the ``external-id`` datatype.
+        references (bool): Whether to include references in claim values.
+        all_ranks (bool): Whether to include all statement ranks (preferred, normal, deprecated).
+        qualifiers (bool): Whether to include qualifiers in claim values.
+        fallback_lang (str): Fallback language when ``lang`` is unavailable.
 
     Returns:
-        list: A list of dictionaries containing QIDs and the similarity scores.
+        dict[str, object | None]: Mapping of requested QIDs to their normalized payloads.
+
+    Raises:
+        HTTPException: If an entity is not found, an upstream request fails, or internal processing fails.
     """
     try:
         filter_pids = []
 
@@ -13,3 +13,28 @@ dependencies = [
     "sqlalchemy>=2.0.41",
     "uvicorn>=0.35.0",
 ]
+
+[dependency-groups]
+dev = [
+    "ruff>=0.9.0"
+]
+
+[tool.ruff]
+target-version = "py313"
+line-length = 120
+
+[tool.ruff.lint]
+select = [
+    "E",   # pycodestyle errors
+    "F",   # Pyflakes (catches undefined names, unused imports, etc.)
+    "I",   # isort (import sorting)
+    "D",   # pydocstyle (function/class documentation)
+]
+
+[tool.ruff.lint.pydocstyle]
+convention = "google"
+
+[tool.ruff.lint.isort]
+known-first-party = [
+    "wikidatasearch"
+]
@@ -1,9 +1,11 @@
+"""Normalize Wikidata Action API JSON into internal textifier objects."""
+
 from __future__ import annotations
 
 from typing import Any, Dict, List, Optional
+
 import requests
 
-from ..WikidataLabel import WikidataLabel, LazyLabelFactory
 from ..Textifier.WikidataTextifier import (
     WikidataClaim,
     WikidataClaimValue,
@@ -14,11 +16,11 @@
     WikidataTime,
 )
 from ..utils import wikidata_geolocation_to_text, wikidata_time_to_text
+from ..WikidataLabel import LazyLabelFactory, WikidataLabel
 
 
 class JSONNormalizer:
-    """Build WikidataEntity + claims tree from Wikidata JSON (wbgetentities style).
-    """
+    """Normalize ``wbgetentities`` JSON into internal textifier objects."""
 
     def __init__(
         self,
@@ -29,6 +31,16 @@ def __init__(
         label_factory: Optional[LazyLabelFactory] = None,
         debug: bool = False,
     ):
+        """Initialize a normalizer for a single entity payload.
+
+        Args:
+            entity_id (str): Entity ID being normalized.
+            entity_json (dict[str, Any]): Raw ``wbgetentities`` JSON for ``entity_id``.
+            lang (str): Preferred language for label selection.
+            fallback_lang (str): Fallback language when ``lang`` is unavailable.
+            label_factory (LazyLabelFactory | None): Shared lazy label factory for nested entities.
+            debug (bool): Whether to print additional debug output while parsing.
+        """
         self.entity_id = entity_id
         self.entity_json = entity_json
 
@@ -51,6 +63,18 @@ def normalize(
         qualifiers: bool = True,
         filter_pids: List[str] = [],
     ) -> WikidataEntity:
+        """Normalize the entity JSON payload into a ``WikidataEntity`` tree.
+
+        Args:
+            external_ids (bool): Whether to include ``external-id`` datatype claims.
+            references (bool): Whether to include references for each statement value.
+            all_ranks (bool): Whether to include statements of all ranks.
+            qualifiers (bool): Whether to include qualifiers for statement values.
+            filter_pids (list[str]): Optional allow-list of property IDs to keep.
+
+        Returns:
+            WikidataEntity: Parsed entity object with claims and values.
+        """
         e = self.entity_json
         if not isinstance(e, dict) or "labels" not in e:
             if self.debug:
 
@@ -1,13 +1,14 @@
+"""Normalize Wikidata TTL into internal textifier objects."""
+
 from __future__ import annotations
 
 from collections import defaultdict
 from typing import Any, DefaultDict, Dict, List, Optional, Set
-import requests
 
+import requests
 from rdflib import Graph, Literal, Namespace, URIRef
 from rdflib.namespace import RDF, RDFS
 
-from ..WikidataLabel import WikidataLabel, LazyLabelFactory
 from ..Textifier.WikidataTextifier import (
     WikidataClaim,
     WikidataClaimValue,
@@ -18,9 +19,9 @@
     WikidataTime,
 )
 from ..utils import wikidata_geolocation_to_text, wikidata_time_to_text
+from ..WikidataLabel import LazyLabelFactory, WikidataLabel
 
-
-# Namespaces used by Wikidata TTL exports
+# Namespaces used by Wikidata TTL
 WD = Namespace("http://www.wikidata.org/entity/")
 P = Namespace("http://www.wikidata.org/prop/")
 PS = Namespace("http://www.wikidata.org/prop/statement/")
@@ -39,18 +40,18 @@
 
 
 class TTLNormalizer:
-    """Parse a Wikidata Special:EntityData TTL and build a WikidataEntity with claims.
+    """Normalize ``Special:EntityData`` TTL into internal textifier objects.
 
     Label resolution order:
-      1) labels present in TTL
-      2) LazyLabelFactory bulk lookup for the remainder
+        1) Labels present in TTL.
+        2) ``LazyLabelFactory`` bulk lookup for unresolved IDs.
 
     Notes:
-      - Claims are extracted from wd:<Q> p:<P> <statement-node> triples only.
-      - Statement nodes are validated structurally before value extraction.
-      - Special values (somevalue/novalue) are treated as "no main value" when
+        - Claims are extracted from ``wd:<Q> p:<P> <statement-node>`` triples only.
+        - Statement nodes are validated structurally before value extraction.
+        - Special values (somevalue/novalue) are treated as "no main value" when
         neither ps:<pid> nor psv:<pid> is present on the statement node.
-      - Property datatype is read from wikibase:propertyType when available,
+        - Property datatype is read from ``wikibase:propertyType`` when available,
         otherwise inferred from the statement's value nodes when possible.
     """
 
@@ -63,6 +64,16 @@ def __init__(
         label_factory: Optional[LazyLabelFactory] = None,
         debug: bool = False,
     ):
+        """Initialize a normalizer for a single TTL document.
+
+        Args:
+            entity_id (str): Entity ID being normalized.
+            ttl_text (str): Raw TTL document from ``Special:EntityData``.
+            lang (str): Preferred language for label selection.
+            fallback_lang (str): Fallback language when ``lang`` is unavailable.
+            label_factory (LazyLabelFactory | None): Shared lazy label factory for nested entities.
+            debug (bool): Whether to print additional debug output while parsing.
+        """
         self.entity_id = entity_id
         self.g = Graph()
         self.g.parse(data=ttl_text, format="turtle")
@@ -85,6 +96,18 @@ def normalize(
         qualifiers: bool = True,
         filter_pids: List[str] = []
     ) -> WikidataEntity:
+        """Normalize the parsed graph into a ``WikidataEntity`` tree.
+
+        Args:
+            external_ids (bool): Whether to include ``external-id`` datatype claims.
+            references (bool): Whether to include references for each statement value.
+            all_ranks (bool): Whether to include statements of all ranks.
+            qualifiers (bool): Whether to include qualifiers for statement values.
+            filter_pids (list[str]): Optional allow-list of property IDs to keep.
+
+        Returns:
+            WikidataEntity: Parsed entity object with claims and values.
+        """
         # Preload labels found inside TTL so LazyLabelFactory can avoid lookups.
         self.label_factory._resolved_labels = self._build_label_cache_from_ttl()
 
 
@@ -1,2 +1,6 @@
+"""Public exports for normalizer classes."""
+
+from .JSONNormalizer import JSONNormalizer
 from .TTLNormalizer import TTLNormalizer
-from .JSONNormalizer import JSONNormalizer
+
+__all__ = ["JSONNormalizer", "TTLNormalizer"]