Musicbrainz reduce memory used by processing by chunks #165

Open
wants to merge 47 commits into base: main

Commits
aca1887
refactor: more ignored keys
Yueqiao12Zhang Aug 19, 2024
110263b
refactor: avoid new allocation
Yueqiao12Zhang Aug 19, 2024
f9d2091
doc: explain if statement
Yueqiao12Zhang Aug 19, 2024
f72eb1e
refactor: extract and convert in chunks
Yueqiao12Zhang Aug 19, 2024
48adb26
fix: write header
Yueqiao12Zhang Aug 19, 2024
87d9fe5
refactor: if not first level, then don't extract name
Yueqiao12Zhang Aug 19, 2024
6659d28
refactor: refresh values list
Yueqiao12Zhang Aug 19, 2024
e79de48
Update convert_to_csv.py
Yueqiao12Zhang Aug 20, 2024
6c565be
Revert "Update convert_to_csv.py"
Yueqiao12Zhang Aug 20, 2024
088c40d
refactor: read jsonl in chunks
Yueqiao12Zhang Aug 20, 2024
9ff292f
Merge branch 'main' into musicbrainz-reduce-memory-used
Yueqiao12Zhang Aug 21, 2024
2cbea9b
test: add print tests
Yueqiao12Zhang Aug 21, 2024
7ae3914
Merge branch 'musicbrainz-reduce-memory-used' of https://github.com/D…
Yueqiao12Zhang Aug 21, 2024
04b3c41
fix: delete buggy code
Yueqiao12Zhang Aug 22, 2024
8203c71
style: delete print test statements
Yueqiao12Zhang Aug 22, 2024
6e51f96
fix: memory bug and header writting
Yueqiao12Zhang Aug 23, 2024
e23ed58
Merge branch 'musicbrainz-reduce-memory-used' of https://github.com/D…
Yueqiao12Zhang Aug 23, 2024
740d365
Merge branch 'main' into musicbrainz-reduce-memory-used
Yueqiao12Zhang Aug 23, 2024
c0d1c78
Merge branch 'main' into musicbrainz-reduce-memory-used
Yueqiao12Zhang Aug 23, 2024
6c6f920
test: delete unused test files
candlecao Aug 23, 2024
99ff276
Update genre.csv
Yueqiao12Zhang Aug 23, 2024
e757df8
fix: header problem by making a temp csv
candlecao Aug 23, 2024
805c464
Update mapping.json
candlecao Aug 30, 2024
81f34e7
mapping: full columns mapping
candlecao Aug 30, 2024
92bfaf1
mapping: fill the empty mappings for updated CSVs
Yueqiao12Zhang Aug 30, 2024
581e86a
refactor: ignore iso codes since they are duplicated info
Yueqiao12Zhang Aug 30, 2024
6422088
refactor: recognize special math character
Yueqiao12Zhang Aug 30, 2024
89371de
doc: add made-up URLs
Yueqiao12Zhang Aug 30, 2024
e93147e
optimize: simplify input of convert_to_csv.py
Yueqiao12Zhang Aug 30, 2024
8dc8814
fix: syntax error
Yueqiao12Zhang Aug 30, 2024
397e95e
fix: entity type is the last part of file path
Yueqiao12Zhang Aug 30, 2024
0759de7
fix: output pathname
candlecao Aug 30, 2024
10f62c2
doc: update manual based on changes that removed commandline arguments
Yueqiao12Zhang Aug 30, 2024
738a7ed
Merge branch 'musicbrainz-reduce-memory-used' of https://github.com/D…
candlecao Aug 30, 2024
ceffbf1
Merge branch 'musicbrainz-reduce-memory-used' of https://github.com/D…
Yueqiao12Zhang Aug 30, 2024
66133e8
style: delete unused test code
Yueqiao12Zhang Aug 30, 2024
7cb07d0
doc: update docstring input to the correct number
Yueqiao12Zhang Aug 30, 2024
3f61a23
refactor: use writer instead of f.write()
Yueqiao12Zhang Aug 30, 2024
5a1783b
Revert "Update mapping.json"
Yueqiao12Zhang Aug 30, 2024
d47751e
doc: style update according to GPT
Yueqiao12Zhang Aug 30, 2024
3bad21e
doc: add specification for chunk_size
Yueqiao12Zhang Aug 30, 2024
6231da5
refactor: change list to set
Yueqiao12Zhang Sep 6, 2024
29dc943
fix: filename bug, readlines bug, opening wrong file bug
Yueqiao12Zhang Sep 6, 2024
28243a5
fix: auto escape by using \t not comma
Yueqiao12Zhang Sep 9, 2024
37201ab
feat: add quotechar to distinguish ", and , separator
Yueqiao12Zhang Sep 13, 2024
72b92e6
refactor: temp should be a tsv
Yueqiao12Zhang Sep 13, 2024
b64da8a
fix: correct a few temp.csv error
Yueqiao12Zhang Sep 13, 2024
116 changes: 74 additions & 42 deletions musicbrainz/csv/convert_to_csv.py
@@ -33,9 +33,7 @@
header = [f"{entity_type}_id"]
values = []

# the file must be from MusicBrainz's JSON data dumps.
with open(inputpath, "r", encoding="utf-8") as f:
json_data = [json.loads(m) for m in f]
IGNORE_COLUMN = ["alias", "tags", "sort-name", "disambiguation", "annotation"]


def extract(data, value: dict, first_level: bool = True, key: str = ""):
@@ -52,9 +50,10 @@ def extract(data, value: dict, first_level: bool = True, key: str = ""):
if key != "":
first_level = False

if "aliases" in key or "tags" in key or "sort-name" in key:
            # ignore aliases, tags, and sort-name to make output simpler
return
for i in IGNORE_COLUMN:
Member:
There's no need to loop here. Make IGNORE_COLUMN a set, and then just do "if key in ignore_column". Since it's a set, the lookup is O(1).

Probably not a huge difference here, but it's good to build these sorts of optimizations into your normal repertoire.

Contributor Author:
The key can be longer and merely contain the ignored string as a substring, so an exact set-membership check would miss it.

if i in key:
            # ignore keys listed in IGNORE_COLUMN to keep the output simpler
return
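The distinction argued in the thread above can be sketched side by side. This is an illustrative snippet, not the PR's code; the helper names `is_ignored_exact` and `is_ignored_substring` are made up, while the column names come from the diff:

```python
# Two lookup strategies for deciding whether a flattened key should be skipped.
IGNORE_COLUMN = {"alias", "tags", "sort-name", "disambiguation", "annotation"}

def is_ignored_exact(key: str) -> bool:
    # O(1) set membership: matches only when the key IS an ignored word.
    return key in IGNORE_COLUMN

def is_ignored_substring(key: str) -> bool:
    # Substring scan: also catches compound keys such as "artist_sort-name".
    return any(ignored in key for ignored in IGNORE_COLUMN)

print(is_ignored_exact("artist_sort-name"))      # False
print(is_ignored_substring("artist_sort-name"))  # True
```

This is why the loop survives the review suggestion: the flattened keys are compound (`parent_child` paths), so a plain set lookup on the whole key would miss most of the entries the author wants to drop.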

if isinstance(data, dict):
# the input JSON Lines format is lines of dictionaries, and the input data should be
@@ -79,7 +78,7 @@ def extract(data, value: dict, first_level: bool = True, key: str = ""):

# after extracting every entry of the current line, append it to the list and empty it.
values.append(copy.deepcopy(value))
value = {}
value.clear()

else:
# if this dictionary is nested, then we do not extract all info,
@@ -101,12 +100,12 @@ def extract(data, value: dict, first_level: bool = True, key: str = ""):
key + "_id",
)

if k == "name":
extract(data["name"], value, first_level, key + "_name")
# if k == "name":
# extract(data["name"], value, first_level, key + "_name")

if isinstance(data[k], dict) or isinstance(data[k], list):
# if there is still a nested instance, extract further
if key.split('_')[-1] not in [
if key.split("_")[-1] not in [
"area",
"artist",
"event",
Expand All @@ -115,6 +114,7 @@ def extract(data, value: dict, first_level: bool = True, key: str = ""):
"recording",
"genres",
]:
# avoid extracting duplicate data
extract(data[k], value, first_level, key + "_" + k)

elif isinstance(data, list):
@@ -152,7 +152,7 @@ def extract(data, value: dict, first_level: bool = True, key: str = ""):
return


def convert_dict_to_csv(dictionary_list: list, filename: str) -> None:
def convert_dict_to_csv(dictionary_list: list) -> None:
"""
(list, str) -> None
Writes a list of dictionaries into the given file.
@@ -163,40 +163,72 @@ def convert_dict_to_csv(dictionary_list: list, filename: str) -> None:
dictionary_list: the list of dictionary that contains all the data
filename: the destination filename
"""
with open(filename, mode="w", newline="", encoding="utf-8") as csv_file:
writer = csv.writer(csv_file)
writer.writerow(header)
# Find the maximum length of lists in the dictionary

for dictionary in dictionary_list:
max_length = max(
len(v) if isinstance(v, list) else 1 for v in dictionary.values()
)

for i in range(max_length):
row = [dictionary[f"{entity_type}_id"]]
for key in header:
if key == f"{entity_type}_id":
continue

if key in dictionary:
if isinstance(dictionary[key], list):
# Append the i-th element of the list,
# or an empty string if index is out of range
row.append(
(dictionary[key])[i] if i < len(dictionary[key]) else ""
)
else:
# Append the single value
# (for non-list entries, only on the first iteration)
row.append(dictionary[key] if i == 0 else "")

# Find the maximum length of lists in the dictionary
for dictionary in dictionary_list:
max_length = max(
len(v) if isinstance(v, list) else 1 for v in dictionary.values()
)

for i in range(max_length):
row = [dictionary[f"{entity_type}_id"]]
for key in header:
if key == f"{entity_type}_id":
continue

if key in dictionary:
if isinstance(dictionary[key], list):
# Append the i-th element of the list,
# or an empty string if index is out of range
row.append(
(dictionary[key])[i] if i < len(dictionary[key]) else ""
)
else:
row.append("")
# Append the single value
# (for non-list entries, only on the first iteration)
row.append(dictionary[key] if i == 0 else "")
else:
row.append("")

with open(
"temp.csv", mode="a", newline="", encoding="utf-8"
) as csv_records:
writer_records = csv.writer(csv_records)
writer_records.writerow(row)

writer.writerow(row)

CHUNK_SIZE = 4096
Contributor:
Could this go at the top with the other configuration constant?

Member:
Can you also add a comment on why you chose 4096?

Contributor Author:
I don't know exactly; I think this could be any number that isn't too large, but GPT and Stack Overflow examples all use something like 4096 or 8192, so I decided to use 4096.

Member:
So, comment that in the code: "4096 was chosen because ChatGPT and StackOverflow examples typically use 4096 or 8192."
In general, always comment (justify) why you chose a value or a method as opposed to some other option.
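One way to address both review comments above is to hoist the constant next to the other configuration and justify the value inline. This is a suggestion sketch, not the PR's final code:

```python
# Number of JSONL lines buffered per processing pass. 4096 is a
# conventional buffer size (ChatGPT and Stack Overflow examples typically
# use 4096 or 8192); any moderate value works, trading peak memory use
# against per-chunk function-call overhead.
CHUNK_SIZE = 4096
```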


if __name__ == "__main__":
extract(json_data, {})

convert_dict_to_csv(values, outputpath)
# the file must be from MusicBrainz's JSON data dumps.
chunk = []

with open(inputpath, "r", encoding="utf-8") as f:
for line in f:
line_data = json.loads(line) # Parse each line as a JSON object
chunk.append(line_data) # Add the JSON object to the current chunk

# When the chunk reaches the desired size, process it
if len(chunk) == CHUNK_SIZE:
extract(chunk, {})
chunk.clear() # Reset the chunk
convert_dict_to_csv(values)

values.clear()

# Process any remaining data in the last chunk
if chunk:
extract(chunk, {})
chunk.clear()
convert_dict_to_csv(values)
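The read-parse-flush loop above can be expressed as a standalone pattern. A minimal sketch, assuming nothing beyond the standard library; `process_in_chunks` and the `StringIO` stand-in for the JSONL dump are illustrative, not part of the PR:

```python
import json
from io import StringIO

def process_in_chunks(f, chunk_size, handle):
    # Buffer parsed JSONL lines; hand each full chunk to `handle`
    # so at most `chunk_size` records are held in memory at once.
    chunk = []
    for line in f:
        chunk.append(json.loads(line))
        if len(chunk) == chunk_size:
            handle(chunk)
            chunk = []  # fresh list: don't mutate what was handed off
    if chunk:
        handle(chunk)  # flush the final partial chunk

# Usage with an in-memory stand-in for the JSONL dump:
stream = StringIO('{"id": 1}\n{"id": 2}\n{"id": 3}\n')
chunks = []
process_in_chunks(stream, 2, chunks.append)
print(chunks)  # [[{'id': 1}, {'id': 2}], [{'id': 3}]]
```

Note the rebind (`chunk = []`) rather than `chunk.clear()` here: the handler keeps a reference to the list, so clearing it in place would empty the data it just received.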

with open(outputpath, "w", encoding="utf-8") as f:
with open("temp.csv", "r", encoding="utf-8") as f_temp:
f.write(",".join(header))
f.write("\n")

for line in f_temp:
f.write(line)

os.remove("temp.csv")
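The header-then-temp-file dance in the final block could also be packaged as one helper. A sketch under the same assumptions as the PR (rows accumulate in a temp file before the header is complete); `finalize_csv` and the sample column names are made up for illustration:

```python
import csv
import os
import shutil

def finalize_csv(outputpath, header, temp_path="temp.csv"):
    # Rows were appended to the temp file chunk by chunk; now that the
    # full header is known, write it first and stream the rows after it.
    with open(outputpath, "w", newline="", encoding="utf-8") as out:
        csv.writer(out).writerow(header)
        with open(temp_path, "r", encoding="utf-8") as f_temp:
            shutil.copyfileobj(f_temp, out)
    os.remove(temp_path)

# Usage: simulate one chunk of rows already accumulated in the temp file.
with open("temp.csv", "a", newline="", encoding="utf-8") as t:
    csv.writer(t).writerows([["1", "x"], ["2", "y"]])
finalize_csv("out.csv", ["artist_id", "name"])
with open("out.csv", encoding="utf-8") as f:
    lines = f.read().splitlines()
print(lines)  # ['artist_id,name', '1,x', '2,y']
os.remove("out.csv")
```

`shutil.copyfileobj` streams the temp file in buffered pieces, so the fix keeps the same bounded-memory property the PR is after.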