From 6af9b665e8a4437fcf174efb5bb16a61ce2008e1 Mon Sep 17 00:00:00 2001 From: Andrea Borruso Date: Sat, 25 Dec 2021 18:05:41 +0100 Subject: [PATCH] aggiunta archiviazione open data italia domani --- .gitignore | 1 + .../script/italiaDomaniDocumenti.sh | 21 ++++++++++++------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index 4e013b8a6..c69d3e64a 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ rawdata/ catalogo/monitoraggioSettimanale/script/old_monitoraggioSettimanale.sh .config catalogo/italiaDomaniDocumenti/script/rawdata/* +!catalogo/italiaDomaniDocumenti/output/open-data/rawdata diff --git a/catalogo/italiaDomaniDocumenti/script/italiaDomaniDocumenti.sh b/catalogo/italiaDomaniDocumenti/script/italiaDomaniDocumenti.sh index 5ee337c26..17f1430d3 100755 --- a/catalogo/italiaDomaniDocumenti/script/italiaDomaniDocumenti.sh +++ b/catalogo/italiaDomaniDocumenti/script/italiaDomaniDocumenti.sh @@ -28,18 +28,18 @@ fi scrape <"$folder"/rawdata/tmp.html -be "//div[@class='banner-documents']" | xq -c '.html.body.div[]|{type:.span[0]."#text",author:.span[1]."#text"}' | mlr --json cat -n >"$folder"/rawdata/tmpTypeAuthor.json -<"$folder"/rawdata/tmp.html scrape -be "//div[@data-cmp-hook-search='wrapper']" | xq -c '.html.body.div[]|{ +scrape <"$folder"/rawdata/tmp.html -be "//div[@data-cmp-hook-search='wrapper']" | xq -c '.html.body.div[]|{ date:.div["@data-date"], title:.div.div[0].h4.button.span, path:.div.div[1].div.div[0].a["@href"] - }' > "$folder"/rawdata/tmp_data.json + }' >"$folder"/rawdata/tmp_data.json mlr -I --json cat -n "$folder"/rawdata/tmp_data.json mlr --json join --ul -j n -f "$folder"/rawdata/tmp_data.json then unsparsify then put -S '$datetime = strftime(strptime($date, "%d/%m/%y"),"%Y-%m-%d");$URL="https://italiadomani.gov.it".$path' then sort -r datetime -f title then clean-whitespace then reorder -f date,type,author,title,path,datetime,URL "$folder"/rawdata/tmpTypeAuthor.json >"$folder"/../output/latest.json # estrai tag -<"$folder"/rawdata/tmp.html scrape -be '//div[@class="banner-documents"]' | xq '.html.body.div[]' | mlr --json cat -n then unsparsify then cut -r -f '^(n|.*div.+text.*)$' then reshape -r "div" -o i,v then filter -x -S '$v==""' then cut -x -f i then nest --implode --values --across-records -f v then rename v,tag >"$folder"/rawdata/tmpTag.json +scrape <"$folder"/rawdata/tmp.html -be '//div[@class="banner-documents"]' | xq '.html.body.div[]' | mlr --json cat -n then unsparsify then cut -r -f '^(n|.*div.+text.*)$' then reshape -r "div" -o i,v then filter -x -S '$v==""' then cut -x -f i then nest --implode --values --across-records -f v then rename v,tag >"$folder"/rawdata/tmpTag.json mlr --json join --ul -j n -f "$folder"/../output/latest.json then unsparsify then cut -x -f n "$folder"/rawdata/tmpTag.json >"$folder"/rawdata/tmp_data.json @@ -65,9 +65,16 @@ mlr --csv put -S '$pubDate = strftime(strptime($datetime, "%Y-%m-%d"),"%Y-%m-%dT ogr2ogr -f geoRSS -dsco TITLE="$title" -dsco LINK="$selflinkraw" -dsco DESCRIPTION="$description" "$folder"/../rss.xml "$folder"/../output/rss.csv -oo AUTODETECT_TYPE=YES # open-data -#mkdir -p "$folder"/../output/open-data/rawdata -#cd "$folder"/../output/open-data/rawdata -#mlr --c2n filter -S '$type=="open-data"' then cut -f URL "$folder"/../output/latest.csv | xargs -n 1 -P 8 wget -q +mkdir -p "$folder"/../output/open-data/rawdata +cd "$folder"/../output/open-data/rawdata +mlr --c2n filter -S 'tolower($tag)=~"open"' then cut -f URL "$folder"/../output/latest.csv | while read line; do + # scarica dati con wget e se il file esiste giĆ  rinominalo con suffisso numerico + wget --backups=1 $line +done + +# cancella file di cui wget ha creato copia e che terminano per numero +find "$folder"/../output/open-data/rawdata -name '*.[1-9]' -delete + # #cd "$folder" # @@ -78,5 +85,3 @@ ogr2ogr -f geoRSS -dsco TITLE="$title" -dsco LINK="$selflinkraw" -dsco DESCRIPTI # mlr -I --csv --ifs ";" -N remove-empty-columns then skip-trivial-records "$folder"/../output/open-data/"$nome" # mlr -I --csv -N put -S 'for (k in $*) {$[k] = gsub($[k], "^ +", "")};for (k in $*) {$[k] = gsub($[k], " +$", "")};for (k in $*) {$[k] = gsub($[k], " +", " ")}' "$folder"/../output/open-data/"$nome" #done - -