|
-- Discover sites (hosts or domains) hosting mostly non-English content
--
-- See also
--   get-records-for-language.sql
--     for restrictions and limitations regarding the
--     automatic detection of the content language(s) and
--   site-discovery-by-language.sql
--     for a similar use case.
--
-- A "site" is defined by the host name part of a URL.
--
-- Grouping by host name, count the number of
--  - total pages
--  - pages with English as primary content language
--  - pages with a primary content "language other than English" (LOTE)
--  - pages without a detected content language
--
-- The result is filtered by a minimum "size" of a site (pages_total)
-- and a minimum share of non-English ("LOTE") pages.
--
-- For testing and adjusting the filter thresholds, it's recommended
-- to restrict the query to a single top-level domain.
-- Remove this restriction once you're ready for the entire data.
--
-- Output: one row per host name passing the filters, ordered by the
-- number of LOTE pages (descending).
--
with tmp as (
    -- per-host page counts and language statistics
    select
        count(*) as pages_total,
        -- window sum over the per-host counts: number of pages of the
        -- entire registered domain (limited to the rows selected below)
        sum(count(*)) over (partition by url_host_registered_domain) as pages_total_domain,
        -- count pages where the primary content language is not English
        -- ("lote" - language other than English); pages without a
        -- detected language are counted neither as LOTE nor as English
        sum(case
                when content_languages is null then 0
                when content_languages like 'eng%' then 0
                else 1
            end) as pages_lote,
        sum(case when content_languages like 'eng%' then 1 else 0 end) as pages_english,
        sum(case when content_languages is null then 1 else 0 end) as pages_language_unknown,
        -- the primary language is the leading three-letter code of content_languages
        count(distinct regexp_extract(content_languages, '^([a-z]{3})')) as num_distinct_primary_languages,
        url_host_tld,
        url_host_registered_domain,
        url_host_name,
        -- the (up to) ten most frequent primary languages of the host as
        -- JSON: histogram -> array of (lang, freq) -> sort by frequency
        -- (descending) -> keep the first ten entries
        cast(
            slice(
                array_sort(
                    cast(map_entries(histogram(regexp_extract(content_languages, '^([a-z]{3})')))
                         as array(row(lang varchar, freq bigint))),
                    (a, b) -> case
                                  when a.freq > b.freq then -1
                                  when a.freq < b.freq then 1
                                  else 0
                              end),
                1, 10)
            as JSON) as top_10_primary_languages
    from ccindex.ccindex
    where crawl = 'CC-MAIN-2026-12'
      and subset = 'warc'
      -- for testing, restrict to a small, but multi-lingual top-level domain
      and url_host_tld = 'va'
    group by
        url_host_tld,
        url_host_registered_domain,
        url_host_name
)
select
    pages_total,
    pages_total_domain,
    pages_lote,
    pages_english,
    pages_language_unknown,
    -- percentages are relative to all pages of the host,
    -- including those without a detected language
    (100.0*pages_lote/pages_total) as perc_lote,
    (100.0*pages_english/pages_total) as perc_english,
    num_distinct_primary_languages,
    url_host_tld,
    url_host_registered_domain,
    url_host_name,
    top_10_primary_languages
from tmp
-- filter thresholds: minimum site size, minimum number of LOTE pages,
-- and minimum share of LOTE pages (30%)
where pages_total >= 10
  and pages_lote >= 5
  -- Compare without dividing: the decimal quotient 1.0*pages_lote/pages_total
  -- keeps only one fractional digit in Trino/Athena, which would let hosts
  -- with a share below 30% slip through the threshold.
  and pages_lote >= 0.3 * pages_total
-- tie-breakers make the output order deterministic
order by
    pages_lote desc,
    pages_total desc,
    url_host_name;
0 commit comments