Skip to content

Commit

Permalink
Data validation
Browse files Browse the repository at this point in the history
- aggregate/split to correct comma separation character
- modified a #REF! value in value_minus_unit_val_x_qty column in sources/Local/Archives_de_la_CCI_de_Marseille_I_32_Marseille_Imports_1749.csv
- removed unused deps in split script
  • Loading branch information
paulgirard committed Jan 2, 2023
1 parent 72d37ac commit 3e1cce7
Show file tree
Hide file tree
Showing 7 changed files with 3,200 additions and 3,222 deletions.
1 change: 1 addition & 0 deletions datapackage.json
Original file line number Diff line number Diff line change
Expand Up @@ -710,6 +710,7 @@
"sources/Local/Archives_de_la_CCI_de_Marseille_I_31_Marseille_Imports_1778.csv",
"sources/Local/Archives_de_la_CCI_de_Marseille_I_31_Marseille_Imports_1779.csv",
"sources/Local/Archives_de_la_CCI_de_Marseille_I_31_Marseille_Imports_1780.csv",
"sources/Local/Archives_de_la_CCI_de_Marseille_I_32_Marseille_Imports_1749.csv",
"sources/Local/Archives_de_la_CCI_de_Rouen_Carton_VIII_110_Caen_Exports_1767.csv",
"sources/Local/Archives_de_la_CCI_de_Rouen_Carton_VIII_110_Caen_Exports_1769.csv",
"sources/Local/Archives_de_la_CCI_de_Rouen_Carton_VIII_110_Caen_Exports_1774.csv",
Expand Down
148 changes: 63 additions & 85 deletions scripts/datapackage_validation.json
Original file line number Diff line number Diff line change
@@ -1,115 +1,92 @@
{
"version": "4.29.0",
"time": 74.294,
"time": 73.84,
"errors": [],
"tasks": [
{
"resource": {
"name": "classifications_index",
"path": "base/classifications_index.csv",
"description": "This table list the classifications available to agregate country and product. It describes the hierarchy of classification which one beeing based on another one. The index does not contain the classification data which are in dedicated files whose name are standardized as base/classification_{model}_{slug}.csv. Those files are not in the datapackage because their structure moves with scientists usage...",
"fields": [
{
"name": "slug",
"type": "string",
"constraints": {
"required": true,
"unique": true
},
"description": "classification slug name used to uniquely identify the classification. The slug is used as variable name in classification tables."
},
{
"name": "model",
"type": "string",
"constraints": {
"required": true,
"enum": [
"partner",
"product"
]
},
"description": "The type of object which are classified. It's the data field in the flow resource which is targeted by the classification."
},
{
"name": "parentSlug",
"type": "string",
"description": "Slug of the parent classification. Left blank for the source which are the root of classification tree."
},
{
"name": "name",
"type": "string",
"constraints": {
"required": true
},
"description": "Human readable name of the classification"
},
{
"name": "author",
"type": "string",
"constraints": {
"required": true
},
"description": "Classification author name.s"
},
{
"name": "description",
"type": "string",
"constraints": {
"required": true
},
"description": "Classification description"
}
],
"foreignKeys": [
{
"fields": "parentSlug",
"reference": {
"resource": "classifications_index",
"fields": "slug"
}
}
],
"profile": "tabular-data-resource",
"scheme": "file",
"format": "csv",
"hashing": "md5",
"stats": {
"hash": "51a4e4f4d9fbc4011eaa5709be0f7d73",
"bytes": 4437,
"fields": 6,
"rows": 34
},
"encoding": "utf-8",
"schema": {
"fields": [
{
"name": "slug",
"type": "string",
"name": "slug"
"constraints": {
"required": true
},
"description": "classification slug name used to uniquely identify the classification. The slug is used as variable name in classification tables."
},
{
"name": "model",
"type": "string",
"name": "model"
"constraints": {
"required": true,
"enum": [
"partner",
"product"
]
},
"description": "The type of object which are classified. It's the data field in the flow resource which is targeted by the classification."
},
{
"name": "parentSlug",
"type": "string",
"name": "parentSlug"
"description": "Slug of the parent classification. Left blank for the source which are the root of classification tree."
},
{
"name": "name",
"type": "string",
"name": "name"
"constraints": {
"required": true
},
"description": "Human readable name of the classification"
},
{
"name": "author",
"type": "string",
"name": "author"
"constraints": {
"required": true
},
"description": "Classification author name.s"
},
{
"name": "description",
"type": "string",
"name": "description"
"constraints": {
"required": true
},
"description": "Classification description"
}
],
"primaryKey": [
"slug",
"model"
],
"foreignKeys": [
{
"fields": "parentSlug",
"reference": {
"resource": "classifications_index",
"fields": "slug"
}
}
]
}
},
"profile": "tabular-data-resource",
"scheme": "file",
"format": "csv",
"hashing": "md5",
"stats": {
"hash": "51a4e4f4d9fbc4011eaa5709be0f7d73",
"bytes": 4437,
"fields": 6,
"rows": 34
},
"encoding": "utf-8"
},
"time": 0.029,
"time": 0.051,
"scope": [
"hash-count-error",
"byte-count-error",
Expand Down Expand Up @@ -944,6 +921,7 @@
"sources/Local/Archives_de_la_CCI_de_Marseille_I_31_Marseille_Imports_1778.csv",
"sources/Local/Archives_de_la_CCI_de_Marseille_I_31_Marseille_Imports_1779.csv",
"sources/Local/Archives_de_la_CCI_de_Marseille_I_31_Marseille_Imports_1780.csv",
"sources/Local/Archives_de_la_CCI_de_Marseille_I_32_Marseille_Imports_1749.csv",
"sources/Local/Archives_de_la_CCI_de_Rouen_Carton_VIII_110_Caen_Exports_1767.csv",
"sources/Local/Archives_de_la_CCI_de_Rouen_Carton_VIII_110_Caen_Exports_1769.csv",
"sources/Local/Archives_de_la_CCI_de_Rouen_Carton_VIII_110_Caen_Exports_1774.csv",
Expand Down Expand Up @@ -1305,13 +1283,13 @@
"scheme": "multipart",
"hashing": "md5",
"stats": {
"hash": "c124c026a55fd2d8fa1013380ee363dd",
"bytes": 123935764,
"hash": "a5353449672a88a7e512d06adc9c2f99",
"bytes": 124263766,
"fields": 35,
"rows": 569548
"rows": 570936
}
},
"time": 74.136,
"time": 73.648,
"scope": [
"hash-count-error",
"byte-count-error",
Expand Down
5 changes: 3 additions & 2 deletions scripts/desagregate_bdd_centrale_in_sources.csv
Original file line number Diff line number Diff line change
Expand Up @@ -568,6 +568,7 @@ filepath,nb_line,diff_nb_line,sources,source_types
../sources/Local/Archives_de_la_CCI_de_Marseille_I_31_Marseille_Imports_1778.csv,632,0,Archives de la CCI de Marseille - I 31,Local
../sources/Local/Archives_de_la_CCI_de_Marseille_I_31_Marseille_Imports_1779.csv,735,0,Archives de la CCI de Marseille - I 31,Local
../sources/Local/Archives_de_la_CCI_de_Marseille_I_31_Marseille_Imports_1780.csv,765,0,Archives de la CCI de Marseille - I 31,Local
../sources/Local/Archives_de_la_CCI_de_Marseille_I_32_Marseille_Imports_1749.csv,979,0,Archives de la CCI de Marseille - I 32,Local
../sources/Local/Archives_de_la_CCI_de_Rouen_Carton_VIII_110_Caen_Exports_1767.csv,92,0,Archives de la CCI de Rouen Carton VIII 110,Local
../sources/Local/Archives_de_la_CCI_de_Rouen_Carton_VIII_110_Caen_Exports_1769.csv,62,0,Archives de la CCI de Rouen Carton VIII 110,Local
../sources/Local/Archives_de_la_CCI_de_Rouen_Carton_VIII_110_Caen_Exports_1774.csv,82,0,Archives de la CCI de Rouen Carton VIII 110,Local
Expand Down Expand Up @@ -657,7 +658,7 @@ filepath,nb_line,diff_nb_line,sources,source_types
../sources/National toutes directions partenaires manquants/AN_F12_1666_Nantes_Exports_1789.csv,517,0,AN F12 1666,National toutes directions partenaires manquants
../sources/National toutes directions partenaires manquants/AN_F12_1666_Narbonne_Exports_1789.csv,227,0,AN F12 1666,National toutes directions partenaires manquants
../sources/National toutes directions partenaires manquants/AN_F12_1666_Passeports_Exports_1789.csv,113,0,AN F12 1666,National toutes directions partenaires manquants
../sources/National toutes directions partenaires manquants/AN_F12_1666_Rouen_Exports_1789.csv,933,0,AN F12 1666,National toutes directions partenaires manquants
../sources/National toutes directions partenaires manquants/AN_F12_1666_Rouen_Exports_1789.csv,1009,0,AN F12 1666,National toutes directions partenaires manquants
../sources/National toutes directions partenaires manquants/AN_F12_1666_Saint-Malo_Exports_1789.csv,217,0,AN F12 1666,National toutes directions partenaires manquants
../sources/National toutes directions partenaires manquants/AN_F12_1666_Saint-Quentin_Exports_1789.csv,10,0,AN F12 1666,National toutes directions partenaires manquants
../sources/National toutes directions partenaires manquants/AN_F12_1666_Soissons_Exports_1789.csv,14,0,AN F12 1666,National toutes directions partenaires manquants
Expand Down Expand Up @@ -901,7 +902,7 @@ filepath,nb_line,diff_nb_line,sources,source_types
../sources/Résumé/AN_F12_251_Imports_1818.csv,1625,0,AN F12 251,Résumé
../sources/Résumé/AN_F12_251_Imports_1819.csv,1544,0,AN F12 251,Résumé
../sources/Résumé/AN_F12_251_Imports_1820.csv,1638,0,AN F12 251,Résumé
../sources/Résumé/AN_F12_251_Imports_1821.csv,1213,0,AN F12 251,Résumé
../sources/Résumé/AN_F12_251_Imports_1821.csv,1546,0,AN F12 251,Résumé
../sources/Résumé/AN_F12_251_Imports_An_10.csv,1296,0,AN F12 251,Résumé
../sources/Résumé/AN_F12_251_Imports_An_11.csv,1191,0,AN F12 251,Résumé
../sources/Résumé/AN_F12_251_Imports_An_12.csv,1139,0,AN F12 251,Résumé
Expand Down
2 changes: 0 additions & 2 deletions scripts/split_bdd_centrale_in_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,10 @@
import itertools
import os
from csv import DictReader, writer, DictWriter
import re
import json
import collections
import shutil
import csv
import unidecode

WRITE = True
VERBOSE = True
Expand Down
Loading

0 comments on commit 3e1cce7

Please sign in to comment.