Skip to content

Commit d0e554c

Browse files
Merge pull request #44 from commoncrawl/discovery-of-non-english-sites
Add example to discover non-English sites
2 parents d5a7f74 + de1d227 commit d0e554c

File tree

2 files changed

+68
-0
lines changed

2 files changed

+68
-0
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,7 @@ A couple of sample queries are also provided (for the flat schema):
163163
- find correlations between TLD and content language using the log-likelihood ratio: [loglikelihood-language-tld.sql](src/sql/examples/cc-index/loglikelihood-language-tld.sql)
164164
- ... and similar for correlations between content language and character encoding: [correlation-language-charset.sql](src/sql/examples/cc-index/correlation-language-charset.sql)
165165
- discover sites hosting content of specific language(s): [site-discovery-by-language.sql](src/sql/examples/cc-index/site-discovery-by-language.sql)
166+
- discover non-English sites: [discovery-of-non-english-sites.sql](src/sql/examples/cc-index/discovery-of-non-english-sites.sql)
166167
- find multi-lingual domains by analyzing URL paths: [get-language-translations-url-path.sql](src/sql/examples/cc-index/get-language-translations-url-path.sql)
167168
- extract robots.txt records for a list of sites: [get-records-robotstxt.sql](src/sql/examples/cc-index/get-records-robotstxt.sql)
168169

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
-- Discover sites (hosts or domains) hosting mostly non-English content
2+
--
3+
-- See also
4+
-- get-records-for-language.sql
5+
-- for restrictions and limitations regarding the
6+
-- automatic detection of the content language(s) and
7+
-- site-discovery-by-language.sql
8+
-- for a similar use case.
9+
--
10+
-- A "site" is defined by the host name part of a URL.
11+
--
12+
-- Grouping by host name, count the number of
13+
-- - total pages
14+
-- - pages with English as primary content language
15+
-- - pages with a primary content "language other than English" (LOTE)
16+
--
17+
-- The result is filtered by a minimum "size" of a site (pages_total)
18+
-- and a minimum share of non-English ("LOTE") pages.
19+
--
20+
-- For testing and adjusting the filter thresholds it's recommended
21+
-- to restrict the query to a single top-level domain.
22+
-- Remove this restriction once you're ready to query the entire data set.
23+
--
24+
-- tmp: one row per host name with page counts split by primary content
-- language. The primary language is taken as the leading three-letter code
-- of content_languages.
with tmp as (
    select count(*) as pages_total,
           -- window over the grouped rows: adds up the per-host page counts
           -- of all hosts sharing the same registered domain
           sum(count(*)) over (partition by url_host_registered_domain) as pages_total_domain,
           -- count pages where the primary content language is not English
           -- ("lote" - language other than English); for pages without a
           -- detected language `not like` evaluates to NULL, so they are
           -- not counted here
           count_if(content_languages not like 'eng%') as pages_lote,
           count_if(content_languages like 'eng%') as pages_english,
           count_if(content_languages is null) as pages_language_unknown,
           count(distinct regexp_extract(content_languages, '^([a-z]{3})')) as num_distinct_primary_languages,
           url_host_tld,
           url_host_registered_domain,
           url_host_name,
           -- the (up to) 10 most frequent primary languages of the host as
           -- JSON, sorted by descending page count; equal counts are ordered
           -- by language code so that the selection is reproducible
           cast(
               slice(
                   array_sort(
                       cast(map_entries(histogram(regexp_extract(content_languages, '^([a-z]{3})')))
                            as array(row(lang varchar, freq bigint))),
                       (a, b) -> case
                                     when a.freq < b.freq then 1
                                     when a.freq > b.freq then -1
                                     when a.lang > b.lang then 1
                                     when a.lang < b.lang then -1
                                     else 0
                                 end),
                   1, 10)
               as JSON) as top_10_primary_languages
    from ccindex.ccindex
    where crawl = 'CC-MAIN-2026-12'
      and subset = 'warc'
      -- for testing, restrict to a small, but multi-lingual top-level domain
      and url_host_tld = 'va'
    group by url_host_tld,
             url_host_registered_domain,
             url_host_name)
select pages_total,
       pages_total_domain,
       pages_lote,
       pages_english,
       pages_language_unknown,
       -- percentages are relative to all pages of the host, including
       -- those without a detected language
       (100.0*pages_lote/pages_total) as perc_lote,
       (100.0*pages_english/pages_total) as perc_english,
       num_distinct_primary_languages,
       url_host_tld,
       url_host_registered_domain,
       url_host_name,
       top_10_primary_languages
from tmp
-- filter thresholds: minimum size of a site, minimum number of
-- non-English pages and minimum share (30%) of non-English pages
where pages_total >= 10
  and pages_lote >= 5
  and (1.0*pages_lote/pages_total) >= .3
-- url_host_name breaks ties between hosts with the same number of
-- non-English pages, keeping the output order stable across runs
order by pages_lote desc,
         url_host_name;

0 commit comments

Comments
 (0)