-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathget-language-translations-url-path.sql
31 lines (31 loc) · 1.29 KB
/
get-language-translations-url-path.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
-- detect domains with language translations by URL path
-- - only for URLs in the Vatican State (.va) top-level domain
-- (a small but multi-lingual TLD for demonstration purposes)
-- - extract two-letter ISO-639-1 language codes in the URL path
-- (using a rather simplistic regular expression)
-- - and show the domain with
-- - page count
-- - number of distinct languages
-- - number of pages per language as map/histogram
-- - only output domains with at least 100 pages and
-- at least one language code in the URL path
--
-- The idea was taken from
-- - Resnik/Smith 2003: The Web as a Parallel Corpus,
-- http://www.aclweb.org/anthology/J03-3002.pdf
-- - Buck 2015: Corpus Acquisition from the Interwebs,
-- http://mt-class.org/jhu-2015/slides/lecture-crawling.pdf
--
SELECT url_host_registered_domain AS domain,
COUNT(DISTINCT(url_path_lang)) as n_lang,
COUNT(*) as n_pages,
histogram(url_path_lang) as lang_counts
FROM "ccindex"."ccindex",
UNNEST(regexp_extract_all(url_path, '(?<=/)(?:[a-z][a-z])(?=/)')) AS t (url_path_lang)
WHERE crawl = 'CC-MAIN-2018-05'
AND subset = 'warc'
AND url_host_registry_suffix = 'va'
GROUP BY url_host_registered_domain
HAVING COUNT(*) >= 100
AND COUNT(DISTINCT(url_path_lang)) >= 1
ORDER BY n_pages DESC;