Skip to content

Commit dfded4b

Browse files
authored
feat: add license match api endpoint to public and operations API (#1557)
1 parent c93c074 commit dfded4b

File tree

16 files changed

+600
-16
lines changed

16 files changed

+600
-16
lines changed

api/.flake8

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[flake8]
22
max-line-length = 120
3-
exclude = .git,__pycache__,__init__.py,.mypy_cache,.pytest_cache,venv,build,feeds_gen,database_gen
3+
exclude = .git,__pycache__,__init__.py,.mypy_cache,.pytest_cache,venv,.venv,build,feeds_gen,database_gen
44
# Ignored because conflict with black
55
extend-ignore = E203

api/.openapi-generator/FILES

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ src/feeds_gen/models/gbfs_endpoint.py
2525
src/feeds_gen/models/gbfs_feed.py
2626
src/feeds_gen/models/gbfs_validation_report.py
2727
src/feeds_gen/models/gbfs_version.py
28+
src/feeds_gen/models/get_matching_licenses_request.py
2829
src/feeds_gen/models/gtfs_dataset.py
2930
src/feeds_gen/models/gtfs_feed.py
3031
src/feeds_gen/models/gtfs_rt_feed.py
@@ -34,6 +35,7 @@ src/feeds_gen/models/license_base.py
3435
src/feeds_gen/models/license_rule.py
3536
src/feeds_gen/models/license_with_rules.py
3637
src/feeds_gen/models/location.py
38+
src/feeds_gen/models/matching_license.py
3739
src/feeds_gen/models/metadata.py
3840
src/feeds_gen/models/redirect.py
3941
src/feeds_gen/models/search_feed_item_result.py

api/src/feeds/impl/licenses_api_impl.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,17 @@
11
from typing import List, Optional
22

33
from feeds_gen.apis.licenses_api_base import BaseLicensesApi
4+
from feeds_gen.models.get_matching_licenses_request import GetMatchingLicensesRequest
45
from feeds_gen.models.license_with_rules import LicenseWithRules
56
from feeds_gen.models.license_base import LicenseBase
7+
from feeds_gen.models.matching_license import MatchingLicense
8+
from shared.common.license_utils import resolve_license
69
from shared.database.database import with_db_session
710
from shared.database_gen.sqlacodegen_models import License as LicenseOrm
811
from feeds.impl.error_handling import raise_http_error
912
from shared.db_models.license_with_rules_impl import LicenseWithRulesImpl
1013
from shared.db_models.license_base_impl import LicenseBaseImpl
14+
from shared.db_models.matching_license_impl import MatchingLicenseImpl
1115

1216

1317
class LicensesApiImpl(BaseLicensesApi):
@@ -42,3 +46,19 @@ def get_licenses(self, limit: int, offset: int, db_session) -> List[LicenseBase]
4246
return [LicenseBaseImpl.from_orm(lic) for lic in results]
4347
except Exception as e:
4448
raise_http_error(500, f"Error retrieving licenses: {e}")
49+
50+
@with_db_session
def get_matching_licenses(
    self,
    get_matching_licenses_request: GetMatchingLicensesRequest,
    db_session,
) -> List[MatchingLicense]:
    """Return the licenses that match the license URL carried by the request.

    The request's ``license_url`` is handed to ``resolve_license`` together
    with the database session, and every result is converted to the API model
    through ``MatchingLicenseImpl.from_domain``.

    Any exception raised while resolving or converting is reported through
    ``raise_http_error`` with status 500.
    """
    try:
        resolved = resolve_license(
            license_url=get_matching_licenses_request.license_url,
            db_session=db_session,
        )
        api_matches = []
        for domain_match in resolved:
            api_matches.append(MatchingLicenseImpl.from_domain(domain_match))
        return api_matches
    except Exception as e:
        # Same error-reporting convention as the other endpoints of this class.
        raise_http_error(500, f"Error retrieving matching licenses: {e}")

api/src/shared/common/db_utils.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -552,9 +552,6 @@ def normalize_url_str(url: str | None) -> str:
552552
u = re.sub(r"^www\.", "", u, flags=re.I)
553553
# remove trailing slashes
554554
u = re.sub(r"/+$", "", u)
555-
if "/" in u:
556-
host, rest = u.split("/", 1)
557-
return host.lower() + "/" + rest
558555
return u.lower()
559556

560557

api/src/shared/common/license_utils.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,36 @@
1212

1313
@dataclass
1414
class MatchingLicense:
15-
"""Response structure for license URL resolution."""
15+
"""Response structure for license URL resolution.
16+
17+
Represents a matched license result from the resolution process, containing
18+
identification, matching metadata, and confidence scoring.
19+
20+
Attributes:
21+
license_id: Unique identifier for the license (typically SPDX ID)
22+
license_url: Original license URL provided for resolution
23+
normalized_url: URL after normalization (lowercased, trimmed, protocol removed)
24+
match_type: Type of match performed. One of:
25+
- 'exact': Direct match found in database
26+
- 'heuristic': Matched via pattern-based rules (CC resolver, common patterns)
27+
- 'fuzzy': Similarity-based match against same-host licenses
28+
- 'none': No match found
29+
confidence: Match confidence score (0.0-1.0)
30+
- 1.0: Exact match
31+
- 0.99: Creative Commons resolved
32+
- 0.95: Pattern heuristic match
33+
- 0.0-1.0: Fuzzy match score based on string similarity
34+
spdx_id: SPDX License Identifier if matched (e.g., 'CC-BY-4.0', 'MIT')
35+
matched_name: Human-readable name of the matched license
36+
matched_catalog_url: Canonical URL from the license catalog/database
37+
matched_source: Source of the match. One of:
38+
- 'db.license': Exact match from database
39+
- 'cc-resolver': Creative Commons license resolver
40+
- 'pattern-heuristics': Generic pattern matching
41+
notes: Additional context about the match (e.g., version normalization, locale detection)
42+
regional_id: Regional/jurisdictional variant identifier for ported licenses
43+
(e.g., 'CC-BY-2.1-jp' for Japan-ported Creative Commons)
44+
"""
1645

1746
license_id: str
1847
license_url: str

api/src/shared/db_models/matching_license_impl.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
from __future__ import annotations
2+
3+
from typing import Optional
4+
5+
from feeds_gen.models.matching_license import MatchingLicense as ApiMatchingLicense
6+
from shared.common.license_utils import MatchingLicense as DomainMatchingLicense
7+
8+
9+
class MatchingLicenseImpl(ApiMatchingLicense):
    """OpenAPI-facing counterpart of the shared ``MatchingLicense`` object.

    Bridges the ``MatchingLicense`` defined in ``shared.common.license_utils``
    and the generated ``feeds_gen.models.matching_license.MatchingLicense``
    model by copying the fields one-to-one.
    """

    @classmethod
    def from_domain(
        cls,
        matching_license: Optional[DomainMatchingLicense],
    ) -> Optional[ApiMatchingLicense]:
        """Build the OpenAPI model from a domain ``MatchingLicense``.

        A ``None`` input is passed through as ``None``; otherwise each field
        listed below is read from ``matching_license`` and forwarded by name
        to the ``cls`` constructor.
        """
        if matching_license is not None:
            # Attributes copied verbatim, in the order they are forwarded.
            copied_fields = (
                "license_id",
                "license_url",
                "normalized_url",
                "match_type",
                "confidence",
                "spdx_id",
                "matched_name",
                "matched_catalog_url",
                "matched_source",
                "regional_id",
                "notes",
            )
            return cls(**{name: getattr(matching_license, name) for name in copied_fields})
        return None

api/tests/integration/test_database.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -174,8 +174,8 @@ def test_insert_and_select():
174174
("https://www.example.com/path///", "example.com/path"),
175175
# Host only with scheme and www; trailing slash removed; host lowercased
176176
("http://www.EXAMPLE.com/", "example.com"),
177-
# Path case preserved (only host lowercased)
178-
("https://Example.com/Case/Sensitive", "example.com/Case/Sensitive"),
177+
# Entire URL is lowercased, including the path (not only the host)
178+
("https://Example.com/Case/Sensitive", "example.com/case/sensitive"),
179179
# None becomes empty string
180180
(None, ""),
181181
# Blank / whitespace-only becomes empty string

docs/DatabaseCatalogAPI.yaml

Lines changed: 111 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -394,7 +394,36 @@ paths:
394394
application/json:
395395
schema:
396396
$ref: "#/components/schemas/LicenseWithRules"
397-
397+
/v1/licenses:match:
398+
post:
399+
description: Get the list of matching licenses based on the provided license URL
400+
tags:
401+
- "licenses"
402+
operationId: getMatchingLicenses
403+
security:
404+
- ApiKeyAuth: []
405+
requestBody:
406+
description: Payload containing the license URL to match against the database.
407+
required: true
408+
content:
409+
application/json:
410+
schema:
411+
type: object
412+
required:
413+
- license_url
414+
properties:
415+
license_url:
416+
description: The license URL to resolve and match against the database.
417+
type: string
418+
format: url
419+
example: https://creativecommons.org/licenses/by/4.0/deed.nl
420+
responses:
421+
"200":
422+
description: The list of matching licenses if any.
423+
content:
424+
application/json:
425+
schema:
426+
$ref: "#/components/schemas/MatchingLicenses"
398427
components:
399428
schemas:
400429
Redirect:
@@ -1331,6 +1360,87 @@ components:
13311360
items:
13321361
$ref: "#/components/schemas/LicenseBase"
13331362

1363+
MatchingLicense:
1364+
type: object
1365+
description: A license matched against a provided license URL, including the match type, source, and confidence score.
1366+
properties:
1367+
license_id:
1368+
description: Unique identifier for the license (typically SPDX ID)
1369+
type: string
1370+
example: CC-BY-4.0
1371+
license_url:
1372+
description: Original license URL provided for resolution
1373+
type: string
1374+
example: https://creativecommons.org/licenses/by/4.0/
1375+
normalized_url:
1376+
description: URL after normalization (lowercased, trimmed, protocol removed)
1377+
type: string
1378+
example: creativecommons.org/licenses/by/4.0
1379+
match_type:
1380+
description: >
1381+
Type of match performed. One of:
1382+
- 'exact': Direct match found in database
1383+
- 'heuristic': Matched via pattern-based rules (CC resolver, common patterns)
1384+
- 'fuzzy': Similarity-based match against same-host licenses
1385+
type: string
1386+
example: heuristic
1387+
confidence:
1388+
description: >
1389+
Match confidence score (0.0-1.0), examples:
1390+
- 1.0: Exact match
1391+
- 0.99: Creative Commons resolved
1392+
- 0.95: Pattern heuristic match
1393+
- 0.0-1.0: Fuzzy match score based on string similarity
1394+
type: number
1395+
example: 0.99
1396+
spdx_id:
1397+
description: SPDX License Identifier if matched (e.g., 'CC-BY-4.0', 'MIT')
1398+
type: string
1399+
example: CC-BY-4.0
1400+
matched_name:
1401+
description: Human-readable name of the matched license
1402+
type: string
1403+
example: Creative Commons Attribution 4.0 International
1404+
matched_catalog_url:
1405+
description: Canonical URL from the license catalog/database
1406+
type: string
1407+
example: https://creativecommons.org/licenses/by/4.0/legalcode
1408+
matched_source:
1409+
description: >
1410+
Source of the match. Examples:
1411+
- 'db.license': Exact match from database
1412+
- 'cc-resolver': Creative Commons license resolver
1413+
- 'pattern-heuristics': Generic pattern matching
1414+
type: string
1415+
example: cc-resolver
1416+
notes:
1417+
description: Additional context about the match (e.g., version normalization, locale detection)
1418+
type: string
1419+
example: Detected locale/jurisdiction port 'nl'. SPDX does not list ported CC licenses; using canonical ID.
1420+
regional_id:
1421+
description: >
1422+
Regional/jurisdictional variant identifier for ported licenses
1423+
(e.g., 'CC-BY-2.1-jp' for Japan-ported Creative Commons)
1424+
type: string
1425+
example: CC-BY-4.0-nl
1426+
example:
1427+
license_id: CC-BY-4.0
1428+
license_url: https://creativecommons.org/licenses/by/4.0/deed.nl
1429+
normalized_url: creativecommons.org/licenses/by/4.0
1430+
match_type: heuristic
1431+
confidence: 0.99
1432+
spdx_id: CC-BY-4.0
1433+
matched_name: Creative Commons Attribution 4.0 International
1434+
matched_catalog_url: https://creativecommons.org/licenses/by/4.0/legalcode
1435+
matched_source: cc-resolver
1436+
notes: Detected locale/jurisdiction port 'nl'. SPDX does not list ported CC licenses; using canonical ID.
1437+
regional_id: CC-BY-4.0-nl
1438+
1439+
MatchingLicenses:
1440+
description: List of MatchingLicense
1441+
type: array
1442+
items:
1443+
$ref: "#/components/schemas/MatchingLicense"
13341444

13351445
parameters:
13361446
status:

0 commit comments

Comments
 (0)