Skip to content

Commit d47bf08

Browse files
committed
fix api response format issues
1 parent 1bdb103 commit d47bf08

16 files changed

Lines changed: 661 additions & 288 deletions

baidu/search.go

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -287,10 +287,19 @@ func (baid *Baidu) SearchImage(ctx context.Context, query core.Query) ([]core.Se
287287
continue
288288
}
289289
res := core.SearchResult{
290-
Rank: (searchPage * 30) + (i + 1),
291-
URL: img.URL[0].Original,
292-
Title: img.Title,
293-
Description: fmt.Sprintf("%v,%v,%vx%x,copyright:%v", img.PictureDate, img.Type, img.Height, img.Width, img.IsCopyright),
290+
Rank: (searchPage * 30) + (i + 1),
291+
URL: img.URL[0].Original,
292+
Title: img.Title,
293+
Description: fmt.Sprintf(
294+
"Source Page: %s, thumb_url:%s, %dx%d, date:%v, type:%v, copyright:%v",
295+
img.URL[0].SourcePage,
296+
img.ThumbURL,
297+
img.Width,
298+
img.Height,
299+
img.PictureDate,
300+
img.Type,
301+
img.IsCopyright,
302+
),
294303
Ad: func() bool {
295304
if img.AdType != "0" {
296305
return true

bing/search.go

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -382,12 +382,14 @@ func (bing *Bing) SearchImage(ctx context.Context, query core.Query) ([]core.Sea
382382
// Extract information from the parsed data
383383
srchRes.Title = imgData.T
384384
srchRes.URL = imgData.IMGURL
385-
srchRes.Description = imgData.Desc
386-
387-
// Add dimensions to description if available
388-
if imgData.W > 0 && imgData.H > 0 {
389-
srchRes.Description += fmt.Sprintf(" (%dx%d)", imgData.W, imgData.H)
390-
}
385+
srchRes.Description = fmt.Sprintf(
386+
"%s Source Page: %s, thumb_url:%s, %dx%d",
387+
imgData.Desc,
388+
imgData.PURL,
389+
imgData.TURL,
390+
imgData.W,
391+
imgData.H,
392+
)
391393

392394
// Get the page URL
393395
if imgData.MURL != "" {

cmd/root.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ import (
1515
)
1616

1717
const (
18-
version = "0.6.7"
18+
version = "0.7.0"
1919
defaultConfigFilename = "config"
2020
envPrefix = "OPENSERP"
2121
)

core/clusters.go

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
package core
22

33
import (
4-
"crypto/sha256"
4+
"crypto/md5"
55
"encoding/hex"
66
"sort"
77
)
@@ -17,12 +17,12 @@ func BuildClusters(results []Result, enginesQueried int) []Cluster {
1717
}
1818

1919
type clusterAccum struct {
20-
occurrences []ClusterOccurrence
21-
scoreSum float64
22-
bestRank int
23-
title string
20+
occurrences []ClusterOccurrence
21+
scoreSum float64
22+
bestRank int
23+
title string
2424
canonicalURL string
25-
domain string
25+
domain string
2626
}
2727

2828
// Group by result ID (which is derived from normalized URL + engine).
@@ -95,8 +95,8 @@ func BuildClusters(results []Result, enginesQueried int) []Cluster {
9595
}
9696

9797
func buildClusterID(normalizedURL string) string {
98-
h := sha256.Sum256([]byte(normalizedURL))
99-
return "c_" + hex.EncodeToString(h[:12])
98+
h := md5.Sum([]byte(normalizedURL))
99+
return "c_" + hex.EncodeToString(h[:responseIDBytes])
100100
}
101101

102102
func roundScore(s float64) float64 {

core/enrichment_domain.go

Lines changed: 111 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,37 @@
11
package core
22

33
import (
4+
_ "embed"
5+
"os"
46
"strings"
7+
"sync"
8+
9+
"golang.org/x/net/publicsuffix"
10+
"gopkg.in/yaml.v3"
11+
)
12+
13+
//go:embed enrichment_domains.yaml
14+
var defaultEnrichmentDomainsYAML []byte
15+
16+
// enrichmentDomainsFile is the on-disk (YAML) shape of the enrichment domain
// lists. It is the decode target for yaml.Unmarshal; the lists are turned into
// lookup sets by parseEnrichmentDomains.
type enrichmentDomainsFile struct {
	// DomainSourceHints maps a domain to a descriptive source hint.
	DomainSourceHints map[string]string `yaml:"domain_source_hints"`
	// NewsDomains lists domains flagged as news.
	NewsDomains []string `yaml:"news_domains"`
	// ForumDomains lists domains flagged as forums.
	ForumDomains []string `yaml:"forum_domains"`
	// MarketplaceDomains lists domains flagged as marketplaces.
	MarketplaceDomains []string `yaml:"marketplace_domains"`
	// SocialDomains lists domains flagged as social platforms.
	SocialDomains []string `yaml:"social_domains"`
}
23+
24+
// enrichmentDomainsConfig is the in-memory, lookup-friendly form of the
// enrichment domain lists. Each category is a set keyed by domain, so a
// missing domain simply reads as false.
type enrichmentDomainsConfig struct {
	// DomainSourceHints maps a domain to its source hint.
	DomainSourceHints map[string]string
	// NewsDomains is the set of news domains.
	NewsDomains map[string]bool
	// ForumDomains is the set of forum domains.
	ForumDomains map[string]bool
	// MarketplaceDomains is the set of marketplace domains.
	MarketplaceDomains map[string]bool
	// SocialDomains is the set of social platform domains.
	SocialDomains map[string]bool
}
31+
32+
var (
33+
enrichmentDomainsOnce sync.Once
34+
enrichmentDomains enrichmentDomainsConfig
535
)
636

737
// EnrichDomainInfo derives TLD/category signals from a bare hostname.
@@ -10,24 +40,26 @@ func EnrichDomainInfo(domain string) *DomainInfo {
1040
return nil
1141
}
1242

43+
domain = normalizeDomain(domain)
1344
tld, sld := splitDomain(domain)
45+
cfg := loadEnrichmentDomains()
1446

1547
info := &DomainInfo{
1648
TLD: tld,
1749
SLD: sld,
1850
IsGov: isGovTLD(domain, tld),
1951
IsEdu: isEduTLD(domain, tld),
2052
IsMil: isMilTLD(tld),
21-
IsNews: newsDomains[domain],
22-
IsForum: forumDomains[domain],
23-
IsMarketplace: marketplaceDomains[domain],
24-
IsSocial: socialDomains[domain],
53+
IsNews: cfg.NewsDomains[domain],
54+
IsForum: cfg.ForumDomains[domain],
55+
IsMarketplace: cfg.MarketplaceDomains[domain],
56+
IsSocial: cfg.SocialDomains[domain],
2557
}
2658
return info
2759
}
2860

2961
// ClassifyURL returns a rough content-type and source hint derived from the
30-
// URL path alone no network calls.
62+
// URL path alone; no network calls.
3163
func ClassifyURL(rawURL, domain string) *Classification {
3264
if rawURL == "" && domain == "" {
3365
return nil
@@ -42,46 +74,42 @@ func ClassifyURL(rawURL, domain string) *Classification {
4274
}
4375
}
4476

45-
// splitDomain returns (tld, sld) for a bare hostname.
46-
// Uses a simple heuristic: last label is TLD, second-to-last is SLD.
47-
// For compound TLDs like co.uk the full suffix is returned as TLD.
77+
// splitDomain returns (public suffix, registrable domain label).
4878
func splitDomain(domain string) (tld, sld string) {
49-
parts := strings.Split(domain, ".")
50-
if len(parts) < 2 {
51-
return domain, ""
79+
domain = normalizeDomain(domain)
80+
if domain == "" {
81+
return "", ""
5282
}
53-
// Known compound TLDs.
54-
compoundTLDs := map[string]bool{
55-
"co.uk": true, "co.jp": true, "co.in": true, "co.nz": true,
56-
"co.za": true, "com.au": true, "com.br": true, "com.mx": true,
57-
"gov.uk": true, "ac.uk": true, "edu.au": true, "gov.au": true,
58-
"or.jp": true, "ne.jp": true,
83+
84+
suffix, icann := publicsuffix.PublicSuffix(domain)
85+
if suffix == "" || !icann {
86+
parts := strings.Split(domain, ".")
87+
if len(parts) < 2 {
88+
return domain, ""
89+
}
90+
return parts[len(parts)-1], parts[len(parts)-2]
5991
}
60-
if len(parts) >= 3 {
61-
compound := parts[len(parts)-2] + "." + parts[len(parts)-1]
62-
if compoundTLDs[compound] {
63-
return compound, parts[len(parts)-3]
92+
93+
registrable, err := publicsuffix.EffectiveTLDPlusOne(domain)
94+
if err != nil {
95+
parts := strings.Split(domain, ".")
96+
if len(parts) < 2 {
97+
return suffix, ""
6498
}
99+
return suffix, parts[len(parts)-2]
65100
}
66-
return parts[len(parts)-1], parts[len(parts)-2]
101+
102+
sld = strings.TrimSuffix(registrable, "."+suffix)
103+
return suffix, sld
67104
}
68105

69106
// isGovTLD reports whether a hostname belongs to a government zone.
//
// tld is the public suffix computed for domain. It matches when it is "gov"
// itself or a country-level government suffix of the form "gov.<cc>"
// ("gov.uk", "gov.au", ...). The ".gov" suffix check on tld is kept for
// backward compatibility.
//
// domain is consulted as a fallback for callers whose tld is only the last
// label: it matches when the hostname ends in ".gov", ".gov.uk" or ".gov.au".
func isGovTLD(domain, tld string) bool {
	switch {
	case tld == "gov",
		strings.HasPrefix(tld, "gov."), // gov.uk, gov.au, gov.in, ...
		strings.HasSuffix(tld, ".gov"):
		return true
	}
	return strings.HasSuffix(domain, ".gov") ||
		strings.HasSuffix(domain, ".gov.uk") ||
		strings.HasSuffix(domain, ".gov.au")
}
77109

78110
// isEduTLD reports whether a hostname belongs to an educational zone.
//
// tld is the public suffix computed for domain. It matches when it is "edu",
// a country-level suffix of the form "edu.<cc>" ("edu.au", ...), or "ac.uk".
// The ".edu" suffix check on tld is kept for backward compatibility.
//
// domain is consulted as a fallback for callers whose tld is only the last
// label: it matches when the hostname ends in ".edu", ".ac.uk" or ".edu.au".
func isEduTLD(domain, tld string) bool {
	switch {
	case tld == "edu",
		tld == "ac.uk",
		strings.HasPrefix(tld, "edu."), // edu.au, edu.cn, ...
		strings.HasSuffix(tld, ".edu"):
		return true
	}
	return strings.HasSuffix(domain, ".edu") ||
		strings.HasSuffix(domain, ".ac.uk") ||
		strings.HasSuffix(domain, ".edu.au")
}
86114

87115
func isMilTLD(tld string) bool {
@@ -111,81 +139,65 @@ func classifyContentType(rawURL string) string {
111139
}
112140

113141
func classifySourceHint(domain string) string {
114-
if hint, ok := domainSourceHints[domain]; ok {
142+
cfg := loadEnrichmentDomains()
143+
if hint, ok := cfg.DomainSourceHints[normalizeDomain(domain)]; ok {
115144
return hint
116145
}
117146
return ""
118147
}
119148

120-
// domainSourceHints maps known domains to a descriptive source hint.
121-
var domainSourceHints = map[string]string{
122-
"wikipedia.org": "encyclopedia",
123-
"en.wikipedia.org": "encyclopedia",
124-
"github.com": "code_repository",
125-
"gitlab.com": "code_repository",
126-
"stackoverflow.com": "qa_forum",
127-
"stackexchange.com": "qa_forum",
128-
"reddit.com": "social_forum",
129-
"nytimes.com": "news",
130-
"bbc.com": "news",
131-
"bbc.co.uk": "news",
132-
"reuters.com": "news",
133-
"theguardian.com": "news",
134-
"washingtonpost.com": "news",
135-
"forbes.com": "news",
136-
"techcrunch.com": "news",
137-
"medium.com": "blog_platform",
138-
"scholar.google.com": "academic",
139-
"arxiv.org": "academic",
140-
"pubmed.ncbi.nlm.nih.gov": "academic",
141-
"amazon.com": "marketplace",
142-
"ebay.com": "marketplace",
143-
"etsy.com": "marketplace",
144-
"docs.google.com": "document",
145-
"youtube.com": "video_platform",
146-
"vimeo.com": "video_platform",
147-
"twitter.com": "social_media",
148-
"x.com": "social_media",
149-
"facebook.com": "social_media",
150-
"linkedin.com": "professional_network",
151-
"instagram.com": "social_media",
149+
func loadEnrichmentDomains() enrichmentDomainsConfig {
150+
enrichmentDomainsOnce.Do(func() {
151+
enrichmentDomains = parseEnrichmentDomains(defaultEnrichmentDomainsYAML)
152+
if path := strings.TrimSpace(os.Getenv("OPENSERP_ENRICHMENT_DOMAINS_FILE")); path != "" {
153+
if data, err := os.ReadFile(path); err == nil {
154+
enrichmentDomains = parseEnrichmentDomains(data)
155+
}
156+
}
157+
})
158+
return enrichmentDomains
152159
}
153160

154-
// newsDomains is the set of known news publisher domains.
155-
var newsDomains = map[string]bool{
156-
"nytimes.com": true, "bbc.com": true, "bbc.co.uk": true,
157-
"reuters.com": true, "apnews.com": true, "theguardian.com": true,
158-
"washingtonpost.com": true, "forbes.com": true, "techcrunch.com": true,
159-
"wired.com": true, "bloomberg.com": true, "cnn.com": true,
160-
"nbcnews.com": true, "cbsnews.com": true, "abcnews.go.com": true,
161-
"foxnews.com": true, "theverge.com": true, "engadget.com": true,
162-
"arstechnica.com": true, "zdnet.com": true, "venturebeat.com": true,
163-
"axios.com": true, "politico.com": true, "theatlantic.com": true,
164-
"economist.com": true, "ft.com": true, "wsj.com": true,
165-
"usatoday.com": true, "latimes.com": true, "nypost.com": true,
166-
}
161+
func parseEnrichmentDomains(data []byte) enrichmentDomainsConfig {
162+
cfg := enrichmentDomainsConfig{
163+
DomainSourceHints: map[string]string{},
164+
NewsDomains: map[string]bool{},
165+
ForumDomains: map[string]bool{},
166+
MarketplaceDomains: map[string]bool{},
167+
SocialDomains: map[string]bool{},
168+
}
169+
170+
var file enrichmentDomainsFile
171+
if err := yaml.Unmarshal(data, &file); err != nil {
172+
return cfg
173+
}
174+
175+
for domain, hint := range file.DomainSourceHints {
176+
domain = normalizeDomain(domain)
177+
hint = strings.TrimSpace(hint)
178+
if domain != "" && hint != "" {
179+
cfg.DomainSourceHints[domain] = hint
180+
}
181+
}
182+
fillDomainSet(cfg.NewsDomains, file.NewsDomains)
183+
fillDomainSet(cfg.ForumDomains, file.ForumDomains)
184+
fillDomainSet(cfg.MarketplaceDomains, file.MarketplaceDomains)
185+
fillDomainSet(cfg.SocialDomains, file.SocialDomains)
167186

168-
// forumDomains is the set of known community/forum domains.
169-
var forumDomains = map[string]bool{
170-
"reddit.com": true, "news.ycombinator.com": true,
171-
"stackoverflow.com": true, "stackexchange.com": true,
172-
"superuser.com": true, "serverfault.com": true,
173-
"quora.com": true, "discourse.org": true,
174-
"boards.4chan.org": true, "hackernews.com": true,
187+
return cfg
175188
}
176189

177-
// marketplaceDomains is the set of known e-commerce/marketplace domains.
178-
var marketplaceDomains = map[string]bool{
179-
"amazon.com": true, "amazon.co.uk": true, "amazon.de": true,
180-
"ebay.com": true, "etsy.com": true, "walmart.com": true,
181-
"target.com": true, "bestbuy.com": true, "newegg.com": true,
182-
"aliexpress.com": true, "alibaba.com": true, "shopify.com": true,
190+
func fillDomainSet(dst map[string]bool, domains []string) {
191+
for _, domain := range domains {
192+
domain = normalizeDomain(domain)
193+
if domain != "" {
194+
dst[domain] = true
195+
}
196+
}
183197
}
184198

185-
// socialDomains is the set of known social media platform domains.
186-
var socialDomains = map[string]bool{
187-
"twitter.com": true, "x.com": true, "facebook.com": true,
188-
"instagram.com": true, "tiktok.com": true, "snapchat.com": true,
189-
"pinterest.com": true, "tumblr.com": true, "linkedin.com": true,
190-
"youtube.com": true, "twitch.tv": true, "discord.com": true,
199+
// normalizeDomain canonicalizes a hostname for use as a lookup key. In order,
// it trims surrounding whitespace, lower-cases the text, removes one leading
// "www." and removes one trailing ".".
func normalizeDomain(domain string) string {
	host := strings.TrimSpace(domain)
	host = strings.ToLower(host)
	host = strings.TrimPrefix(host, "www.")
	host = strings.TrimSuffix(host, ".")
	return host
}

0 commit comments

Comments
 (0)