Musicbrainz reduce memory used by processing by chunks #165

Status: Open. Wants to merge 47 commits into base: main.
Showing changes from 44 of 47 commits.

Commits:
aca1887 refactor: more ignored keys (Yueqiao12Zhang, Aug 19, 2024)
110263b refactor: avoid new allocation (Yueqiao12Zhang, Aug 19, 2024)
f9d2091 doc: explain if statement (Yueqiao12Zhang, Aug 19, 2024)
f72eb1e refactor: extract and convert in chunks (Yueqiao12Zhang, Aug 19, 2024)
48adb26 fix: write header (Yueqiao12Zhang, Aug 19, 2024)
87d9fe5 refactor: if not first level, then don't extract name (Yueqiao12Zhang, Aug 19, 2024)
6659d28 refactor: refresh values list (Yueqiao12Zhang, Aug 19, 2024)
e79de48 Update convert_to_csv.py (Yueqiao12Zhang, Aug 20, 2024)
6c565be Revert "Update convert_to_csv.py" (Yueqiao12Zhang, Aug 20, 2024)
088c40d refactor: read jsonl in chunks (Yueqiao12Zhang, Aug 20, 2024)
9ff292f Merge branch 'main' into musicbrainz-reduce-memory-used (Yueqiao12Zhang, Aug 21, 2024)
2cbea9b test: add print tests (Yueqiao12Zhang, Aug 21, 2024)
7ae3914 Merge branch 'musicbrainz-reduce-memory-used' of https://github.com/D… (Yueqiao12Zhang, Aug 21, 2024)
04b3c41 fix: delete buggy code (Yueqiao12Zhang, Aug 22, 2024)
8203c71 style: delete print test statements (Yueqiao12Zhang, Aug 22, 2024)
6e51f96 fix: memory bug and header writting (Yueqiao12Zhang, Aug 23, 2024)
e23ed58 Merge branch 'musicbrainz-reduce-memory-used' of https://github.com/D… (Yueqiao12Zhang, Aug 23, 2024)
740d365 Merge branch 'main' into musicbrainz-reduce-memory-used (Yueqiao12Zhang, Aug 23, 2024)
c0d1c78 Merge branch 'main' into musicbrainz-reduce-memory-used (Yueqiao12Zhang, Aug 23, 2024)
6c6f920 test: delete unused test files (candlecao, Aug 23, 2024)
99ff276 Update genre.csv (Yueqiao12Zhang, Aug 23, 2024)
e757df8 fix: header problem by making a temp csv (candlecao, Aug 23, 2024)
805c464 Update mapping.json (candlecao, Aug 30, 2024)
81f34e7 mapping: full columns mapping (candlecao, Aug 30, 2024)
92bfaf1 mapping: fill the empty mappings for updated CSVs (Yueqiao12Zhang, Aug 30, 2024)
581e86a refactor: ignore iso codes since they are duplicated info (Yueqiao12Zhang, Aug 30, 2024)
6422088 refactor: recognize special math character (Yueqiao12Zhang, Aug 30, 2024)
89371de doc: add made-up URLs (Yueqiao12Zhang, Aug 30, 2024)
e93147e optimize: simplify input of convert_to_csv.py (Yueqiao12Zhang, Aug 30, 2024)
8dc8814 fix: syntax error (Yueqiao12Zhang, Aug 30, 2024)
397e95e fix: entity type is the last part of file path (Yueqiao12Zhang, Aug 30, 2024)
0759de7 fix: output pathname (candlecao, Aug 30, 2024)
10f62c2 doc: update manual based on changes that removed commandline arguments (Yueqiao12Zhang, Aug 30, 2024)
738a7ed Merge branch 'musicbrainz-reduce-memory-used' of https://github.com/D… (candlecao, Aug 30, 2024)
ceffbf1 Merge branch 'musicbrainz-reduce-memory-used' of https://github.com/D… (Yueqiao12Zhang, Aug 30, 2024)
66133e8 style: delete unused test code (Yueqiao12Zhang, Aug 30, 2024)
7cb07d0 doc: update docstring input to the correct number (Yueqiao12Zhang, Aug 30, 2024)
3f61a23 refactor: use writer instead of f.write() (Yueqiao12Zhang, Aug 30, 2024)
5a1783b Revert "Update mapping.json" (Yueqiao12Zhang, Aug 30, 2024)
d47751e doc: style update according to GPT (Yueqiao12Zhang, Aug 30, 2024)
3bad21e doc: add specification for chunk_size (Yueqiao12Zhang, Aug 30, 2024)
6231da5 refactor: change list to set (Yueqiao12Zhang, Sep 6, 2024)
29dc943 fix: filename bug, readlines bug, opening wrong file bug (Yueqiao12Zhang, Sep 6, 2024)
28243a5 fix: auto escape by using \t not comma (Yueqiao12Zhang, Sep 9, 2024)
37201ab feat: add quotechar to distinguish ", and , separator (Yueqiao12Zhang, Sep 13, 2024)
72b92e6 refactor: temp should be a tsv (Yueqiao12Zhang, Sep 13, 2024)
b64da8a fix: correct a few temp.csv error (Yueqiao12Zhang, Sep 13, 2024)
65 changes: 58 additions & 7 deletions csv2rdf/MusicBrainz/mapping.json
@@ -1,33 +1,84 @@
{
"annotation": "http://www.wikidata.org/prop/direct/P2916",
"address": "http://www.wikidata.org/prop/direct/P6375",
"area": "http://www.wikidata.org/prop/direct/P276",
"area_id": "http://www.wikidata.org/prop/direct/P982",
"area_name": "http://www.wikidata.org/prop/direct/P276",
"artist-credit_artist_id": "http://www.wikidata.org/prop/direct/P175",
"artist-credit_artist_name": "http://www.wikidata.org/prop/direct/P2561",
"artist-credit_name": "http://www.wikidata.org/prop/direct/P2561",
"asin": "http://www.wikidata.org/prop/direct/P5749",
"barcode": "http://www.wikidata.org/prop/direct/P3962",
"begin-area": "http://www.wikidata.org/prop/direct/P1427",
"begin-area_id": "http://www.wikidata.org/prop/direct/P1427",
"begin-area_iso-3166-1-codes": "http://www.wikidata.org/prop/direct/P299",
"begin-area_iso-3166-2-codes": "http://www.wikidata.org/prop/direct/P300",
"begin-area_iso-3166-3-codes": "http://www.wikidata.org/prop/direct/P773",
"cancelled": "https://musicbrainz.org/doc/Event#cancelled",
"coordinates": "http://www.wikidata.org/prop/direct/P625",
"country": "http://www.wikidata.org/prop/direct/P17",
"date": "http://www.wikidata.org/prop/direct/P585",
"description": "http://schema.org/description",
"disambiguation": "http://schema.org/disambiguatingDescription",
"end-area": "http://www.wikidata.org/prop/direct/P1444",
"end-area_id": "http://www.wikidata.org/prop/direct/P1444",
"end-area_iso-3166-1-codes": "http://www.wikidata.org/prop/direct/P299",
"end-area_iso-3166-2-codes": "http://www.wikidata.org/prop/direct/P300",
"end-area_iso-3166-3-codes": "http://www.wikidata.org/prop/direct/P773",
"entity_type": {
"area.csv": "http://www.wikidata.org/entity/Q2221906",
"artist.csv": "http://www.wikidata.org/entity/Q483501",
"event.csv": "http://www.wikidata.org/entity/Q1656682",
"genre.csv": "http://www.wikidata.org/entity/Q188451",
"instrument.csv": "http://www.wikidata.org/entity/Q34379",
"label.csv": "http://www.wikidata.org/entity/Q18127",
"recording.csv": "http://www.wikidata.org/entity/Q15975575"
"place.csv": "http://www.wikidata.org/entity/Q2221906",
"recording.csv": "http://www.wikidata.org/entity/Q15975575",
"release-group.csv": "http://www.wikidata.org/entity/Q108346082",
"release.csv": "http://www.wikidata.org/entity/Q3972943",
"series.csv": "http://www.wikidata.org/entity/Q3511132",
"work.csv": "http://www.wikidata.org/entity/Q2188189"
},
"first-release-date": "http://www.wikidata.org/prop/direct/P577",
"gender": "http://www.wikidata.org/prop/direct/P21",
"gender-id": "http://www.wikidata.org/prop/direct/P21",
"genres_id": "http://www.wikidata.org/prop/direct/P136",
"genres_name": "https://schema.org/genre",
"ipis": "http://www.wikidata.org/prop/direct/P3453",
"isnis": "http://www.wikidata.org/prop/direct/P213",
"iso-3166-1-codes": "http://www.wikidata.org/prop/direct/P299",
"iso-3166-2-codes": "http://www.wikidata.org/prop/direct/P300",
"iso-3166-3-codes": "http://www.wikidata.org/prop/direct/P773",
"isrcs": "http://www.wikidata.org/prop/direct/P1243",
"iswcs": "http://www.wikidata.org/prop/direct/P1827",
"label-code": "http://www.wikidata.org/prop/direct/P7320",
"label-info_label_id": "http://www.wikidata.org/prop/direct/P264",
"language": "http://www.wikidata.org/prop/direct/P407",
"languages": "http://www.wikidata.org/prop/direct/P407",
"length": "http://www.wikidata.org/prop/direct/P2047",
"media_data-tracks_artist-credit_artist_id": "http://www.wikidata.org/prop/direct/P697",
"media_data-tracks_id": "http://www.wikidata.org/prop/direct/P2550",
"media_data-tracks_recording_id": "http://www.wikidata.org/prop/direct/P2550",
"media_discs_id": "https://musicbrainz.org/disc",
"media_discs_offsets": "http://www.wikidata.org/prop/direct/P4153",
"media_pregap_artist-credit_artist_id": "http://www.wikidata.org/prop/direct/P697",
"media_pregap_id": "https://musicbrainz.org/pregap",
"media_pregap_recording_id": "http://www.wikidata.org/prop/direct/P2550",
"media_tracks_artist-credit_artist_id": "http://www.wikidata.org/prop/direct/P697",
"media_tracks_id": "http://www.wikidata.org/prop/direct/P9831",
"media_tracks_recording_id": "http://www.wikidata.org/prop/direct/P2550",
"name": "http://www.wikidata.org/prop/direct/P2561",
"packaging": "https://musicbrainz.org/packaging",
"packaging-id": "https://musicbrainz.org/packaging",
"primary-type": "http://www.wikidata.org/prop/direct/P2308",
"primary-type-id": "http://www.wikidata.org/prop/direct/P2308",
"quality": "http://www.wikidata.org/prop/direct/P1552",
"relations_wiki": "http://www.wikidata.org/prop/direct/P2888",
"release-events_area_id": "http://www.wikidata.org/prop/direct/P1427",
"release-group_artist-credit_artist_id": "http://www.wikidata.org/prop/direct/P697",
"release-group_genres_id": "http://www.wikidata.org/prop/direct/P136",
"release-group_id": "http://www.wikidata.org/prop/direct/P9831",
"release-group_secondary-type-ids": "http://www.wikidata.org/prop/direct/P2308",
"release-group_secondary-types": "http://www.wikidata.org/prop/direct/P2308",
"secondary-type-ids": "http://www.wikidata.org/prop/direct/P2308",
"secondary-types": "http://www.wikidata.org/prop/direct/P2308",
"setlist": "http://www.wikidata.org/prop/direct/P9793",
"status": "http://www.wikidata.org/prop/direct/P6216",
"status-id": "http://www.wikidata.org/prop/direct/P6216",
"time": "http://www.wikidata.org/prop/direct/P585",
"title": "http://www.wikidata.org/prop/direct/P1476",
"type": "http://www.wikidata.org/prop/direct/P2308",
2 changes: 1 addition & 1 deletion csv2rdf/csv2rdf_single_subject.py
@@ -82,7 +82,7 @@ def convert_csv_to_turtle(filenames: List[str]) -> Graph:
else:
if element == "True" or element == "False":
obj = Literal(element, datatype=XSD.boolean)
-elif element.isnumeric():
+elif element.isdigit():
obj = Literal(element, datatype=XSD.integer)
else:
obj = Literal(element)
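For context on this one-line change: `isdigit()` accepts fewer strings than `isnumeric()`, though neither matches `int()` exactly; only `isdecimal()` guarantees a parseable string. A quick comparison of standard Python behavior:

```python
# int() parses only plain decimal digits, so stricter checks are safer guards:
"42".isdecimal()  # True  -- only 0-9; every isdecimal() string is safe for int()
"42".isdigit()    # True
"42".isnumeric()  # True
"²".isdigit()     # True  -- superscript digits pass, yet int("²") raises ValueError
"²".isdecimal()   # False
"½".isnumeric()   # True  -- fractions and CJK numerals pass, yet int("½") fails
"½".isdigit()     # False
```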
91 changes: 66 additions & 25 deletions musicbrainz/README.md
@@ -1,27 +1,68 @@
# 1: The procedure:
* Since all IDs and Wikidata links are already reconciled in the conversion process, there's no need to turn to OpenRefine.
- Steps:
1. Navigate to the ```linkedmusic-datalake/musicbrainz/csv``` folder.
2. Run ```python3 fetch.py``` to get the latest tar.xz files from the MusicBrainz public data dumps into the local ```data/raw/``` folder.
3. Run ```python3 untar.py``` to unzip the files and extract the needed jsonl files into the local ```data/raw/extracted_jsonl/mbdump/``` folder.
4. Run convert_to_csv.py, specifying the JSON file as the first argument and the entity type as the second argument.
* Example command line:
```python3 convert_to_csv.py data/raw/extracted_jsonl/mbdump/area area```
5. A CSV file named by its entity type will be generated in the ```data/output/``` folder. It can be used for further operations.

# 2: The data details:
- From the link provided below (it offers two versions; we usually choose the latest, such as 20240626-001001/), we can download archived files ending with the ".tar.xz" suffix. Unzipping any of them reveals an "mbdump" folder containing a file named by its entity type, without an extension. This is the dump in "JSON Lines" format; each line represents one record.
- The name of the file is simply the entity type (the class of an instance) in the database. For example, there are types such as area, artist, event, instrument, label, place, etc.
- Every line has an attribute named "id", which is the primary key of the record. When converting to CSV, we rename the id to the "{entity_type}_id" format to make explicit which entity type we are working with.
- During conversion, for all IDs of the different entity types (genre_id, artist_id, area_id, etc.), we prepend the MusicBrainz reference in the format "https://musicbrainz.org/{entity_type}/{id}", which turns the id into a URI reference.
- If a record has been reconciled with a Wikidata link by the MusicBrainz bots, it has an object under "relations" > "resources" > "url" whose value is the Wikidata link. When present, it is extracted into the CSV file.

# DEPRECATED
# 3: As experiment data sets:
- For experimental purposes, it is best to use only a small portion of each data dump.
- For example, to extract 3000 entries of an entity with bash, open a terminal in the "mbdump" folder and execute:
head -n 3000 "area" > "test_area"
to get the first 3000 lines from the area data dump.
- All other data dumps follow the same procedure.
## 1: Procedure

### Prerequisites:
- All IDs and Wikidata links are already reconciled during the conversion process, eliminating the need for OpenRefine.

### Steps:
1. **Navigate to the target folder:**
- Go to the `linkedmusic-datalake/musicbrainz/csv` directory.

2. **Fetch the latest data:**
- Run the following command to download the latest tar.xz files from the MusicBrainz public data dumps:
```bash
python3 fetch.py
```
- The files will be saved in the local `data/raw/` folder.

3. **Extract the required files:**
- Unzip and extract the necessary JSON Lines (jsonl) files by running:
```bash
python3 untar.py
```
- The extracted files will be located in the `data/raw/extracted_jsonl/mbdump/` folder.

4. **Convert data to CSV:**
- Execute the conversion script:
```bash
python3 convert_to_csv.py
```
- This will generate a CSV file, named according to its entity type, in the `data/output/` folder. (The script reads each dump in fixed-size chunks to limit memory use; see the sketch after these steps.)

5. **Output:**
- The generated CSV files are ready for further processing.
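
The conversion script bounds memory by reading each JSON Lines dump in fixed-size chunks instead of loading the whole file at once. A minimal sketch of the pattern (the `process` callback and chunk size here are illustrative, not the script's exact code):

```python
import json

CHUNK_SIZE = 4096  # lines per chunk; caps how many parsed records are held at once

def process_in_chunks(path, process):
    """Stream a JSON Lines file to `process` in fixed-size batches."""
    chunk = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            chunk.append(json.loads(line))  # parse one record per line
            if len(chunk) == CHUNK_SIZE:
                process(chunk)
                chunk.clear()
    if chunk:  # flush the final, partially filled chunk
        process(chunk)
```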

---

## 2: Data Details

### Overview:
- The data can be downloaded from the provided link, typically selecting the latest version (e.g., `20240626-001001/`).
- The downloaded `.tar.xz` files contain a `mbdump` folder with files named by entity type (e.g., `area`, `artist`, `event`, `instrument`, `label`, `place`). Each file is in "JSON Lines" format, with each line representing a single record.

### Important Notes:
- **ID Attributes:** Each record has an `id` attribute, which serves as the primary key. When converting to CSV, this `id` is renamed to `{entity_type}_id` for clarity.
- **URI Conversion:** All IDs (e.g., `genre_id`, `artist_id`, `area_id`) are converted to URIs in the format `https://musicbrainz.org/{entity_type}/{id}`; see the sketch below.
- **Wikidata Links:** If a record is linked to a Wikidata entry by MusicBrainz bots, the link can be found under `"relations" > "resources" > "url"`. These are also extracted into the CSV.
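
A minimal sketch of the ID-to-URI rewrite described above (the MBID shown is only an example):

```python
def to_musicbrainz_uri(entity_type: str, record_id: str) -> str:
    """Turn a bare MusicBrainz id into a dereferenceable URI."""
    return f"https://musicbrainz.org/{entity_type}/{record_id}"

# to_musicbrainz_uri("artist", "b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d")
# -> "https://musicbrainz.org/artist/b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d"
```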

---

## 3: Mapping

### Custom Predicate URLs:
- The following made-up predicate URLs are used in the data conversion (a sketch of how the mapping is applied follows this list):
- `"packaging"`: `https://musicbrainz.org/packaging`
- `"packaging-id"`: `https://musicbrainz.org/packaging`
- `"media_pregap_id"`: `https://musicbrainz.org/pregap`
- `"media_discs_id"`: `https://musicbrainz.org/disc`

---

## Deprecated: Experiment Data Sets

### Experiment Guidelines:
- For experimental purposes, it is recommended to use a small portion of each data dump.
- Use the following bash command to extract the first 3000 entries of a specific entity (e.g., `area`):
```bash
head -n 3000 "area" > "test_area"
```
- Apply the same process to other data dumps if needed.
138 changes: 87 additions & 51 deletions musicbrainz/csv/convert_to_csv.py
@@ -19,24 +19,24 @@
import copy
import csv
import os
import glob
import sys

DIRNAME = os.path.dirname(__file__)

if len(sys.argv) != 3:
if len(sys.argv) != 1:
raise ValueError("Invalid number of arguments")

entity_type = sys.argv[2]
inputpath = os.path.join(DIRNAME, sys.argv[1])
outputpath = os.path.join(DIRNAME, "../data/output", f"{entity_type}.csv")
inputpath = os.path.relpath("../data/raw/extracted_jsonl/mbdump")
outputpath = os.path.relpath("../data/output")

header = [f"{entity_type}_id"]
IGNORE_COLUMN = {"alias", "tags", "sort-name", "disambiguation", "annotation"}
CHUNK_SIZE = 4096
# 4096 lines per chunk is a conventional buffer size: large enough to amortize per-chunk overhead, small enough to keep memory use modest.
entity_type = ""
header = []
values = []

# the file must be from MusicBrainz's JSON data dumps.
with open(inputpath, "r", encoding="utf-8") as f:
json_data = [json.loads(m) for m in f]


def extract(data, value: dict, first_level: bool = True, key: str = ""):
"""
@@ -52,9 +52,10 @@ def extract(data, value: dict, first_level: bool = True, key: str = ""):
if key != "":
first_level = False

if "aliases" in key or "tags" in key or "sort-name" in key:
# ignore aliases, tags, and sort-name to make output simpler
return
for i in IGNORE_COLUMN:
Review comment (Member): There's no need to loop here. Make IGNORE_COLUMN a set, and then just do "if key in IGNORE_COLUMN"; since it's a set, the lookup is O(1). Probably not a huge difference here, but it's good to build these sorts of optimizations into your normal repertoire.

Reply (Contributor, PR author): The key can be longer and merely contain an ignored string (e.g., a nested key), and we still want to skip it, so this has to be a substring check rather than an exact set lookup.
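
To make the trade-off in this thread concrete: an exact set lookup is O(1) but misses compound keys, while the substring scan catches them (key name made up):

```python
IGNORE_COLUMN = {"alias", "tags", "sort-name", "disambiguation", "annotation"}

key = "artist_tags_count"  # a nested key that merely contains an ignored name

print(key in IGNORE_COLUMN)                  # False: exact lookup would keep it
print(any(i in key for i in IGNORE_COLUMN))  # True: substring scan skips it
```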

if i in key:
# ignore aliases, tags, and sort-name to make output simpler
return

if isinstance(data, dict):
# the input JSON Lines format is lines of dictionaries, and the input data should be
@@ -79,7 +80,7 @@ def extract(data, value: dict, first_level: bool = True, key: str = ""):

# after extracting every entry of the current line, append it to the list and empty it.
values.append(copy.deepcopy(value))
value = {}
value.clear()

else:
# if this dictionary is nested, then we do not extract all info,
@@ -101,20 +102,21 @@ def extract(data, value: dict, first_level: bool = True, key: str = ""):
key + "_id",
)

if k == "name":
extract(data["name"], value, first_level, key + "_name")

if isinstance(data[k], dict) or isinstance(data[k], list):
# if there is still a nested instance, extract further
if key.split('_')[-1] not in [
if key.split("_")[-1] not in {
"area",
"artist",
"event",
"instrument",
"label",
"recording",
"genres",
]:
"iso-3166-1-codes",
"iso-3166-2-codes",
"iso-3166-3-codes",
}:
# avoid extracting duplicate data
extract(data[k], value, first_level, key + "_" + k)

elif isinstance(data, list):
@@ -152,9 +152,9 @@ def extract(data, value: dict, first_level: bool = True, key: str = ""):
return


def convert_dict_to_csv(dictionary_list: list, filename: str) -> None:
def convert_dict_to_csv(dictionary_list: list) -> None:
"""
(list, str) -> None
(list) -> None
Writes a list of dictionaries into the given file.
If there are multiple values against a single key, a new column with only the
id and that value is created.
@@ -163,40 +165,74 @@ def convert_dict_to_csv(dictionary_list: list) -> None:
dictionary_list: the list of dictionaries that contains all the data
filename: the destination filename
"""
with open(filename, mode="w", newline="", encoding="utf-8") as csv_file:
writer = csv.writer(csv_file)
writer.writerow(header)
# Find the maximum length of lists in the dictionary

for dictionary in dictionary_list:
max_length = max(
len(v) if isinstance(v, list) else 1 for v in dictionary.values()
)

for i in range(max_length):
row = [dictionary[f"{entity_type}_id"]]
for key in header:
if key == f"{entity_type}_id":
continue

if key in dictionary:
if isinstance(dictionary[key], list):
# Append the i-th element of the list,
# or an empty string if index is out of range
row.append(
(dictionary[key])[i] if i < len(dictionary[key]) else ""
)
else:
# Append the single value
# (for non-list entries, only on the first iteration)
row.append(dictionary[key] if i == 0 else "")

# Find the maximum length of lists in the dictionary
for dictionary in dictionary_list:
max_length = max(
len(v) if isinstance(v, list) else 1 for v in dictionary.values()
)

for i in range(max_length):
row = [dictionary[f"{entity_type}_id"]]
for key in header:
if key == f"{entity_type}_id":
continue

if key in dictionary:
if isinstance(dictionary[key], list):
# Append the i-th element of the list,
# or an empty string if index is out of range
row.append(
(dictionary[key])[i] if i < len(dictionary[key]) else ""
)
else:
row.append("")
# Append the single value
# (for non-list entries, only on the first iteration)
row.append(dictionary[key] if i == 0 else "")
else:
row.append("")

writer.writerow(row)
with open(
"temp.csv", mode="a", newline="", encoding="utf-8"
) as csv_records:
writer_records = csv.writer(csv_records, delimiter="\t")
Review comment (Member): This is no longer comma-separated now... it's a different format.

writer_records.writerow(row)


if __name__ == "__main__":
extract(json_data, {})

convert_dict_to_csv(values, outputpath)
for file in glob.glob(f"{inputpath}/*"):
# the file must be from MusicBrainz's JSON data dumps.
entity_type = file.split("/")[-1]
header = [f"{entity_type}_id"]
values = []
chunk = []

with open(file, "r", encoding="utf-8") as f:
for line in f:
line_data = json.loads(line) # Parse each line as a JSON object
chunk.append(line_data) # Add the JSON object to the current chunk

# When the chunk reaches the desired size, process it
if len(chunk) == CHUNK_SIZE:
extract(chunk, {})
chunk.clear() # Reset the chunk
convert_dict_to_csv(values)

values.clear()

# Process any remaining data in the last chunk
if chunk:
extract(chunk, {})
chunk.clear()
convert_dict_to_csv(values)

with open(os.path.join(outputpath, entity_type + ".csv"), "w", encoding="utf-8") as f:
with open("temp.csv", "r", encoding="utf-8") as f_temp:
writer = csv.writer(f)
writer.writerow(header)

for line in f_temp.readlines():
writer.writerow(line.strip().split("\t"))

os.remove("temp.csv")
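
One caveat with the final step above: re-parsing the temp file with `line.strip().split("\t")` breaks if a quoted field itself contains a tab or newline. A quoting-aware sketch of the same step (function name hypothetical; assumes the temp file was written by `csv.writer(..., delimiter="\t")` as in the code above):

```python
import csv
import os

def finalize_csv(temp_path: str, final_path: str, header: list) -> None:
    """Copy the tab-separated temp file into the final comma-separated CSV."""
    with open(final_path, "w", newline="", encoding="utf-8") as f_out, \
            open(temp_path, "r", newline="", encoding="utf-8") as f_temp:
        writer = csv.writer(f_out)
        writer.writerow(header)
        # csv.reader honors the quotechar, unlike a bare split("\t"),
        # so quoted fields containing tabs or newlines survive the round trip
        for row in csv.reader(f_temp, delimiter="\t"):
            writer.writerow(row)
    os.remove(temp_path)
```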