This project generates data dumps in JSON and CSV formats.
Before running this project, please ensure that you have the following requirements installed on your machine:
- PostgreSQL: You will need to have PostgreSQL installed. If you don't have it installed, you can download it from the official website: PostgreSQL
- Python 3: You will need to have Python 3 installed. If you don't have it installed, you can download it from the official website: Python
To set up the project, follow these steps:
- Clone the repository:
git clone [email protected]:datacite/corpus-data-file.git
- Navigate to the project directory:
cd corpus-data-file
- Create a
.env
file:Add your database credentials in thecp .env.example .env
.env
file.
To make the scripts executable, run:
chmod +x ./export-script/create_assertion_formatted_table.sh
Run the script to create a table with formatted data:
./export-script/create_assertion_formatted_table.sh
Run the script to generate the data dump files:
./export-script/generate_assertion_details.sh
-
Create Multiple SQL Queries: Create multiple SQL queries to create a table and populate it with related formatted data following the spec document.
-
Automate Table Creation: Create a bash script create_assertion_formatted_table.sh to automate the creation of the table.
-
Generate Data Dump Files: Create a bash script generate_assertion_details.sh to generate the data dump files. This will create JSON dump files from the formatted table and convert each individual file to CSV using a Python script convert_to_csv.py following the spec document.
This script is used to validate the accession numbers in our database against a set of regular expressions for each repository.
- Ensure you have Python 3 installed on your system.
- Navigate to the script directory:
cd accession_number_validation
- Install the required Python packages:
pip install -r requirements.txt
- Create a
.env
file in the project root and add database credentials:Open thetouch .env
.env
file and add the following lines:DB_NAME=<database_name> DB_USER=<database_username> DB_PASSWORD=<database_password> DB_HOST=<database_host> DB_PORT=<database_port>
To run the accession_number_validation.py
script, use the following command:
python accession_number_validation.py
This section outlines the steps and SQL queries used to clean and refine the dataset by removing duplicate, non-citation, clinical trial, orphaned records and invalid citations. The aim is to ensure the integrity and relevance of the dataset by retaining only the most pertinent and accurate data points.
- Total Initial Records: 10,006,058
To count the initial records:
select count(*) from public.assertions;
- Duplicates Removed: 4,110,019
- Total Records After Removal: 5,896,039
Query to delete duplicates:
BEGIN;
DROP TABLE IF EXISTS assertions_affiliations_temp;
CREATE TEMPORARY TABLE assertions_affiliations_temp AS
SELECT * FROM assertions_affiliations;
DROP TABLE IF EXISTS assertions_funders_temp;
CREATE TEMPORARY TABLE assertions_funders_temp AS
SELECT * FROM assertions_funders;
DROP TABLE IF EXISTS assertions_subjects_temp;
CREATE TEMPORARY TABLE assertions_subjects_temp AS
SELECT * FROM assertions_subjects;
DROP TABLE IF EXISTS ranked_assertions;
CREATE TEMPORARY TABLE ranked_assertions AS
SELECT *,
ROW_NUMBER() OVER (PARTITION BY obj_id, subj_id, repository_id, publisher_id, journal_id, accession_number, source_id ORDER BY updated DESC) AS rn
FROM assertions;
TRUNCATE TABLE assertions CASCADE;
INSERT INTO assertions (id, type, created, updated, activity_id, repository_id, publisher_id, journal_id, source_type, title, obj_id, subj_id, published_date, accession_number, doi, relation_type_id, source_id, not_found, retried)
SELECT id, type, created, updated, activity_id, repository_id, publisher_id, journal_id, source_type, title, obj_id, subj_id, published_date, accession_number, doi, relation_type_id, source_id, not_found, retried
FROM ranked_assertions
WHERE rn = 1;
INSERT INTO assertions_affiliations
SELECT *
FROM assertions_affiliations_temp
WHERE assertion_id IN (SELECT id FROM assertions);
INSERT INTO assertions_subjects
SELECT *
FROM assertions_subjects_temp
WHERE assertion_id IN (SELECT id FROM assertions);
INSERT INTO assertions_funders
SELECT *
FROM assertions_funders_temp
WHERE assertion_id IN (SELECT id FROM assertions);
DROP TABLE ranked_assertions;
DROP TABLE assertions_affiliations_temp;
DROP TABLE assertions_funders_temp;
DROP TABLE assertions_subjects_temp;
COMMIT;
- Non-Citation Records Removed: 273,567
- Total Records After Removal: 5,622,472
Query to delete non-citation records:
BEGIN;
DROP TABLE IF EXISTS assertions_affiliations_temp;
CREATE TEMPORARY TABLE assertions_affiliations_temp AS
SELECT * FROM assertions_affiliations;
DROP TABLE IF EXISTS assertions_funders_temp;
CREATE TEMPORARY TABLE assertions_funders_temp AS
SELECT * FROM assertions_funders;
DROP TABLE IF EXISTS assertions_subjects_temp;
CREATE TEMPORARY TABLE assertions_subjects_temp AS
SELECT * FROM assertions_subjects;
DROP TABLE IF EXISTS citation_relation_types_assertions;
CREATE TEMPORARY TABLE citation_relation_types_assertions AS
SELECT *
FROM public.assertions
WHERE relation_type_id is null
OR relation_type_id in (
'cites',
'is-cited-by',
'references',
'is-referenced-by',
'is-supplemented-by',
'is-supplement-to'
);
TRUNCATE TABLE assertions CASCADE;
INSERT INTO assertions
SELECT *
FROM citation_relation_types_assertions;
INSERT INTO assertions_affiliations
SELECT *
FROM assertions_affiliations_temp
WHERE assertion_id IN (SELECT id FROM assertions);
INSERT INTO assertions_subjects
SELECT *
FROM assertions_subjects_temp
WHERE assertion_id IN (SELECT id FROM assertions);
INSERT INTO assertions_funders
SELECT *
FROM assertions_funders_temp
WHERE assertion_id IN (SELECT id FROM assertions);
DROP TABLE citation_relation_types_assertions;
DROP TABLE assertions_affiliations_temp;
DROP TABLE assertions_funders_temp;
DROP TABLE assertions_subjects_temp;
COMMIT;
- Clinical Trials Records Removed: 28,334
- Total Records After Removal: 5,594,138
Query to delete clinical trials data:
BEGIN;
DROP TABLE IF EXISTS assertions_affiliations_temp;
CREATE TEMPORARY TABLE assertions_affiliations_temp AS
SELECT * FROM assertions_affiliations;
DROP TABLE IF EXISTS assertions_funders_temp;
CREATE TEMPORARY TABLE assertions_funders_temp AS
SELECT * FROM assertions_funders;
DROP TABLE IF EXISTS assertions_subjects_temp;
CREATE TEMPORARY TABLE assertions_subjects_temp AS
SELECT * FROM assertions_subjects;
DROP TABLE IF EXISTS non_clinical_trial_assertions;
CREATE TEMPORARY TABLE non_clinical_trial_assertions AS
SELECT *
FROM assertions
WHERE repository_id NOT IN ('fef75a3c-6e48-4170-be9d-415601efb689', '2638e611-ff6f-49db-9b3e-702ecd16176b') OR repository_id IS NULL;
TRUNCATE TABLE assertions CASCADE;
INSERT INTO assertions
SELECT *
FROM non_clinical_trial_assertions;
INSERT INTO assertions_affiliations
SELECT *
FROM assertions_affiliations_temp
WHERE assertion_id IN (SELECT id FROM assertions);
INSERT INTO assertions_subjects
SELECT *
FROM assertions_subjects_temp
WHERE assertion_id IN (SELECT id FROM assertions);
INSERT INTO assertions_funders
SELECT *
FROM assertions_funders_temp
WHERE assertion_id IN (SELECT id FROM assertions);
DROP TABLE non_clinical_trial_assertions;
DROP TABLE assertions_affiliations_temp;
DROP TABLE assertions_funders_temp;
DROP TABLE assertions_subjects_temp;
COMMIT;
- Funders Removed (without related assertions): 1,003
- Subjects Removed (without related assertions): 0
- Affiliations Removed (without related assertions): 6,199
Query to delete orphaned records:
BEGIN;
DELETE FROM funders
WHERE id NOT IN (
SELECT funder_id FROM assertions_funders
);
DELETE FROM subjects
WHERE id NOT IN (
SELECT subject_id FROM assertions_subjects
);
DELETE FROM affiliations
WHERE id NOT IN (
SELECT affiliation_id FROM assertions_affiliations
);
COMMIT;
- Records Removed: 44,097
- Total Records After Removal: 5,550,041
Query to delete invalid citations:
BEGIN;
DROP TABLE IF EXISTS assertions_affiliations_temp;
CREATE TEMPORARY TABLE assertions_affiliations_temp AS
SELECT * FROM assertions_affiliations;
DROP TABLE IF EXISTS assertions_funders_temp;
CREATE TEMPORARY TABLE assertions_funders_temp AS
SELECT * FROM assertions_funders;
DROP TABLE IF EXISTS assertions_subjects_temp;
CREATE TEMPORARY TABLE assertions_subjects_temp AS
SELECT * FROM assertions_subjects;
DROP TABLE IF EXISTS subj_id_obj_id;
CREATE TEMPORARY TABLE subj_id_obj_id AS
SELECT *
FROM assertions
WHERE subj_id <> obj_id;
TRUNCATE TABLE assertions CASCADE;
INSERT INTO assertions
SELECT *
FROM subj_id_obj_id;
INSERT INTO assertions_affiliations
SELECT *
FROM assertions_affiliations_temp
WHERE assertion_id IN (SELECT id FROM assertions);
INSERT INTO assertions_subjects
SELECT *
FROM assertions_subjects_temp
WHERE assertion_id IN (SELECT id FROM assertions);
INSERT INTO assertions_funders
SELECT *
FROM assertions_funders_temp
WHERE assertion_id IN (SELECT id FROM assertions);
DROP TABLE subj_id_obj_id;
DROP TABLE assertions_affiliations_temp;
DROP TABLE assertions_funders_temp;
DROP TABLE assertions_subjects_temp;
COMMIT;
- Initial Total Records: 10,006,058
- Duplicates Removed: 4,110,019
- Non-Citation Records Removed: 273,567
- Clinical Trials Records Removed: 28,334
- Funders Removed (without related assertions): 1,003
- Subjects Removed (without related assertions): 0
- Affiliations Removed (without related assertions): 6,199
- Invalid Citations Removed: 44,097
- Final Total Records: 5,550,041