11package core
22
33import (
4+ _ "embed"
5+ "os"
46 "strings"
7+ "sync"
8+
9+ "golang.org/x/net/publicsuffix"
10+ "gopkg.in/yaml.v3"
11+ )
12+
13+ //go:embed enrichment_domains.yaml
14+ var defaultEnrichmentDomainsYAML []byte
15+
// enrichmentDomainsFile is the on-disk (YAML) shape of the enrichment domain
// lists. It is decoded by parseEnrichmentDomains and then converted into the
// lookup-friendly enrichmentDomainsConfig.
type enrichmentDomainsFile struct {
	// DomainSourceHints maps a hostname to a descriptive source hint.
	DomainSourceHints map[string]string `yaml:"domain_source_hints"`

	// The remaining fields are plain hostname lists, one per category.
	NewsDomains        []string `yaml:"news_domains"`
	ForumDomains       []string `yaml:"forum_domains"`
	MarketplaceDomains []string `yaml:"marketplace_domains"`
	SocialDomains      []string `yaml:"social_domains"`
}
23+
// enrichmentDomainsConfig is the in-memory form of the enrichment domain
// lists: every category is a map keyed by hostname so membership checks are
// single lookups.
type enrichmentDomainsConfig struct {
	// DomainSourceHints maps a hostname to its source hint string.
	DomainSourceHints map[string]string

	// Category sets; a hostname is a member when its value is true.
	NewsDomains, ForumDomains, MarketplaceDomains, SocialDomains map[string]bool
}
31+
32+ var (
33+ enrichmentDomainsOnce sync.Once
34+ enrichmentDomains enrichmentDomainsConfig
535)
636
737// EnrichDomainInfo derives TLD/category signals from a bare hostname.
@@ -10,24 +40,26 @@ func EnrichDomainInfo(domain string) *DomainInfo {
1040 return nil
1141 }
1242
43+ domain = normalizeDomain (domain )
1344 tld , sld := splitDomain (domain )
45+ cfg := loadEnrichmentDomains ()
1446
1547 info := & DomainInfo {
1648 TLD : tld ,
1749 SLD : sld ,
1850 IsGov : isGovTLD (domain , tld ),
1951 IsEdu : isEduTLD (domain , tld ),
2052 IsMil : isMilTLD (tld ),
21- IsNews : newsDomains [domain ],
22- IsForum : forumDomains [domain ],
23- IsMarketplace : marketplaceDomains [domain ],
24- IsSocial : socialDomains [domain ],
53+ IsNews : cfg . NewsDomains [domain ],
54+ IsForum : cfg . ForumDomains [domain ],
55+ IsMarketplace : cfg . MarketplaceDomains [domain ],
56+ IsSocial : cfg . SocialDomains [domain ],
2557 }
2658 return info
2759}
2860
2961// ClassifyURL returns a rough content-type and source hint derived from the
30- // URL path alone — no network calls.
62+ // URL path alone; no network calls.
3163func ClassifyURL (rawURL , domain string ) * Classification {
3264 if rawURL == "" && domain == "" {
3365 return nil
@@ -42,46 +74,42 @@ func ClassifyURL(rawURL, domain string) *Classification {
4274 }
4375}
4476
45- // splitDomain returns (tld, sld) for a bare hostname.
46- // Uses a simple heuristic: last label is TLD, second-to-last is SLD.
47- // For compound TLDs like co.uk the full suffix is returned as TLD.
77+ // splitDomain returns (public suffix, registrable domain label).
4878func splitDomain (domain string ) (tld , sld string ) {
49- parts := strings . Split (domain , "." )
50- if len ( parts ) < 2 {
51- return domain , ""
79+ domain = normalizeDomain (domain )
80+ if domain == "" {
81+ return "" , ""
5282 }
53- // Known compound TLDs.
54- compoundTLDs := map [string ]bool {
55- "co.uk" : true , "co.jp" : true , "co.in" : true , "co.nz" : true ,
56- "co.za" : true , "com.au" : true , "com.br" : true , "com.mx" : true ,
57- "gov.uk" : true , "ac.uk" : true , "edu.au" : true , "gov.au" : true ,
58- "or.jp" : true , "ne.jp" : true ,
83+
84+ suffix , icann := publicsuffix .PublicSuffix (domain )
85+ if suffix == "" || ! icann {
86+ parts := strings .Split (domain , "." )
87+ if len (parts ) < 2 {
88+ return domain , ""
89+ }
90+ return parts [len (parts )- 1 ], parts [len (parts )- 2 ]
5991 }
60- if len (parts ) >= 3 {
61- compound := parts [len (parts )- 2 ] + "." + parts [len (parts )- 1 ]
62- if compoundTLDs [compound ] {
63- return compound , parts [len (parts )- 3 ]
92+
93+ registrable , err := publicsuffix .EffectiveTLDPlusOne (domain )
94+ if err != nil {
95+ parts := strings .Split (domain , "." )
96+ if len (parts ) < 2 {
97+ return suffix , ""
6498 }
99+ return suffix , parts [len (parts )- 2 ]
65100 }
66- return parts [len (parts )- 1 ], parts [len (parts )- 2 ]
101+
102+ sld = strings .TrimSuffix (registrable , "." + suffix )
103+ return suffix , sld
67104}
68105
// isGovTLD reports whether a hostname sits in a government namespace.
//
// domain is the hostname and tld is its suffix as computed by splitDomain.
// A match is any of:
//   - tld is exactly "gov";
//   - tld is a country-code government suffix of the form "gov.<cc>"
//     (for example "gov.uk", "gov.au");
//   - tld ends in ".gov";
//   - domain ends in ".gov", ".gov.uk" or ".gov.au". These checks cover the
//     case where splitDomain fell back to a plain label split and tld is only
//     the last label.
func isGovTLD(domain, tld string) bool {
	switch {
	case tld == "gov",
		strings.HasPrefix(tld, "gov."),
		strings.HasSuffix(tld, ".gov"):
		return true
	}
	return strings.HasSuffix(domain, ".gov") ||
		strings.HasSuffix(domain, ".gov.uk") ||
		strings.HasSuffix(domain, ".gov.au")
}
77109
// isEduTLD reports whether a hostname sits in an educational namespace.
//
// domain is the hostname and tld is its suffix as computed by splitDomain.
// A match is any of:
//   - tld is exactly "edu" or "ac.uk";
//   - tld is a country-code education suffix of the form "edu.<cc>"
//     (for example "edu.au");
//   - tld ends in ".edu";
//   - domain ends in ".edu", ".ac.uk" or ".edu.au". These checks cover the
//     case where splitDomain fell back to a plain label split and tld is only
//     the last label.
func isEduTLD(domain, tld string) bool {
	switch {
	case tld == "edu",
		tld == "ac.uk",
		strings.HasPrefix(tld, "edu."),
		strings.HasSuffix(tld, ".edu"):
		return true
	}
	return strings.HasSuffix(domain, ".edu") ||
		strings.HasSuffix(domain, ".ac.uk") ||
		strings.HasSuffix(domain, ".edu.au")
}
86114
87115func isMilTLD (tld string ) bool {
@@ -111,81 +139,65 @@ func classifyContentType(rawURL string) string {
111139}
112140
113141func classifySourceHint (domain string ) string {
114- if hint , ok := domainSourceHints [domain ]; ok {
142+ cfg := loadEnrichmentDomains ()
143+ if hint , ok := cfg .DomainSourceHints [normalizeDomain (domain )]; ok {
115144 return hint
116145 }
117146 return ""
118147}
119148
120- // domainSourceHints maps known domains to a descriptive source hint.
121- var domainSourceHints = map [string ]string {
122- "wikipedia.org" : "encyclopedia" ,
123- "en.wikipedia.org" : "encyclopedia" ,
124- "github.com" : "code_repository" ,
125- "gitlab.com" : "code_repository" ,
126- "stackoverflow.com" : "qa_forum" ,
127- "stackexchange.com" : "qa_forum" ,
128- "reddit.com" : "social_forum" ,
129- "nytimes.com" : "news" ,
130- "bbc.com" : "news" ,
131- "bbc.co.uk" : "news" ,
132- "reuters.com" : "news" ,
133- "theguardian.com" : "news" ,
134- "washingtonpost.com" : "news" ,
135- "forbes.com" : "news" ,
136- "techcrunch.com" : "news" ,
137- "medium.com" : "blog_platform" ,
138- "scholar.google.com" : "academic" ,
139- "arxiv.org" : "academic" ,
140- "pubmed.ncbi.nlm.nih.gov" : "academic" ,
141- "amazon.com" : "marketplace" ,
142- "ebay.com" : "marketplace" ,
143- "etsy.com" : "marketplace" ,
144- "docs.google.com" : "document" ,
145- "youtube.com" : "video_platform" ,
146- "vimeo.com" : "video_platform" ,
147- "twitter.com" : "social_media" ,
148- "x.com" : "social_media" ,
149- "facebook.com" : "social_media" ,
150- "linkedin.com" : "professional_network" ,
151- "instagram.com" : "social_media" ,
149+ func loadEnrichmentDomains () enrichmentDomainsConfig {
150+ enrichmentDomainsOnce .Do (func () {
151+ enrichmentDomains = parseEnrichmentDomains (defaultEnrichmentDomainsYAML )
152+ if path := strings .TrimSpace (os .Getenv ("OPENSERP_ENRICHMENT_DOMAINS_FILE" )); path != "" {
153+ if data , err := os .ReadFile (path ); err == nil {
154+ enrichmentDomains = parseEnrichmentDomains (data )
155+ }
156+ }
157+ })
158+ return enrichmentDomains
152159}
153160
154- // newsDomains is the set of known news publisher domains.
155- var newsDomains = map [string ]bool {
156- "nytimes.com" : true , "bbc.com" : true , "bbc.co.uk" : true ,
157- "reuters.com" : true , "apnews.com" : true , "theguardian.com" : true ,
158- "washingtonpost.com" : true , "forbes.com" : true , "techcrunch.com" : true ,
159- "wired.com" : true , "bloomberg.com" : true , "cnn.com" : true ,
160- "nbcnews.com" : true , "cbsnews.com" : true , "abcnews.go.com" : true ,
161- "foxnews.com" : true , "theverge.com" : true , "engadget.com" : true ,
162- "arstechnica.com" : true , "zdnet.com" : true , "venturebeat.com" : true ,
163- "axios.com" : true , "politico.com" : true , "theatlantic.com" : true ,
164- "economist.com" : true , "ft.com" : true , "wsj.com" : true ,
165- "usatoday.com" : true , "latimes.com" : true , "nypost.com" : true ,
166- }
161+ func parseEnrichmentDomains (data []byte ) enrichmentDomainsConfig {
162+ cfg := enrichmentDomainsConfig {
163+ DomainSourceHints : map [string ]string {},
164+ NewsDomains : map [string ]bool {},
165+ ForumDomains : map [string ]bool {},
166+ MarketplaceDomains : map [string ]bool {},
167+ SocialDomains : map [string ]bool {},
168+ }
169+
170+ var file enrichmentDomainsFile
171+ if err := yaml .Unmarshal (data , & file ); err != nil {
172+ return cfg
173+ }
174+
175+ for domain , hint := range file .DomainSourceHints {
176+ domain = normalizeDomain (domain )
177+ hint = strings .TrimSpace (hint )
178+ if domain != "" && hint != "" {
179+ cfg .DomainSourceHints [domain ] = hint
180+ }
181+ }
182+ fillDomainSet (cfg .NewsDomains , file .NewsDomains )
183+ fillDomainSet (cfg .ForumDomains , file .ForumDomains )
184+ fillDomainSet (cfg .MarketplaceDomains , file .MarketplaceDomains )
185+ fillDomainSet (cfg .SocialDomains , file .SocialDomains )
167186
168- // forumDomains is the set of known community/forum domains.
169- var forumDomains = map [string ]bool {
170- "reddit.com" : true , "news.ycombinator.com" : true ,
171- "stackoverflow.com" : true , "stackexchange.com" : true ,
172- "superuser.com" : true , "serverfault.com" : true ,
173- "quora.com" : true , "discourse.org" : true ,
174- "boards.4chan.org" : true , "hackernews.com" : true ,
187+ return cfg
175188}
176189
177- // marketplaceDomains is the set of known e-commerce/marketplace domains.
178- var marketplaceDomains = map [string ]bool {
179- "amazon.com" : true , "amazon.co.uk" : true , "amazon.de" : true ,
180- "ebay.com" : true , "etsy.com" : true , "walmart.com" : true ,
181- "target.com" : true , "bestbuy.com" : true , "newegg.com" : true ,
182- "aliexpress.com" : true , "alibaba.com" : true , "shopify.com" : true ,
190+ func fillDomainSet (dst map [string ]bool , domains []string ) {
191+ for _ , domain := range domains {
192+ domain = normalizeDomain (domain )
193+ if domain != "" {
194+ dst [domain ] = true
195+ }
196+ }
183197}
184198
185- // socialDomains is the set of known social media platform domains.
186- var socialDomains = map [string ]bool {
187- "twitter.com" : true , "x.com" : true , "facebook.com" : true ,
188- "instagram.com" : true , "tiktok.com" : true , "snapchat.com" : true ,
189- "pinterest.com" : true , "tumblr.com" : true , "linkedin.com" : true ,
190- "youtube.com" : true , "twitch.tv" : true , "discord.com" : true ,
// normalizeDomain canonicalizes a hostname for lookups: surrounding
// whitespace is trimmed, the name is lower-cased, one trailing root dot is
// removed, and a leading "www." label is dropped.
//
// The "www." label is only dropped when a dotted name remains afterwards, so
// a registrable name such as "www.com" is kept intact instead of collapsing
// into the bare TLD "com".
func normalizeDomain(domain string) string {
	domain = strings.ToLower(strings.TrimSpace(domain))
	domain = strings.TrimSuffix(domain, ".")

	const prefix = "www."
	if strings.HasPrefix(domain, prefix) {
		if rest := domain[len(prefix):]; strings.Contains(rest, ".") {
			return rest
		}
	}
	return domain
}
0 commit comments