-
Notifications
You must be signed in to change notification settings - Fork 4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Musicbrainz reduce memory used by processing by chunks #165
Changes from 21 commits
aca1887
110263b
f9d2091
f72eb1e
48adb26
87d9fe5
6659d28
e79de48
6c565be
088c40d
9ff292f
2cbea9b
7ae3914
04b3c41
8203c71
6e51f96
e23ed58
740d365
c0d1c78
6c6f920
99ff276
e757df8
805c464
81f34e7
92bfaf1
581e86a
6422088
89371de
e93147e
8dc8814
397e95e
0759de7
10f62c2
738a7ed
ceffbf1
66133e8
7cb07d0
3f61a23
5a1783b
d47751e
3bad21e
6231da5
29dc943
28243a5
37201ab
72b92e6
b64da8a
241dea3
956f924
3146891
b58e618
d6e48bf
23a6861
222e715
dcaf73e
63c8914
8d4ef18
ed26803
622cefe
8e99189
84e4b1b
798d448
13e8601
6c6401e
d380d46
c4be2a5
9fe4df5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -33,9 +33,7 @@ | |
header = [f"{entity_type}_id"] | ||
values = [] | ||
|
||
# the file must be from MusicBrainz's JSON data dumps. | ||
with open(inputpath, "r", encoding="utf-8") as f: | ||
json_data = [json.loads(m) for m in f] | ||
IGNORE_COLUMN = ["alias", "tags", "sort-name", "disambiguation", "annotation"] | ||
|
||
|
||
def extract(data, value: dict, first_level: bool = True, key: str = ""): | ||
|
@@ -52,9 +50,10 @@ def extract(data, value: dict, first_level: bool = True, key: str = ""): | |
if key != "": | ||
first_level = False | ||
|
||
if "aliases" in key or "tags" in key or "sort-name" in key: | ||
# ignore aliases, tags, and sort-name to make output simplier | ||
return | ||
for i in IGNORE_COLUMN: | ||
if i in key: | ||
# ignore aliases, tags, and sort-name to make output simplier | ||
return | ||
|
||
if isinstance(data, dict): | ||
# the input JSON Lines format is lines of dictionaries, and the input data should be | ||
|
@@ -79,7 +78,7 @@ def extract(data, value: dict, first_level: bool = True, key: str = ""): | |
|
||
# after extracting every entry of the current line, append it to the list and empty it. | ||
values.append(copy.deepcopy(value)) | ||
value = {} | ||
value.clear() | ||
|
||
else: | ||
# if this dictionary is nested, then we do not extract all info, | ||
|
@@ -101,12 +100,12 @@ def extract(data, value: dict, first_level: bool = True, key: str = ""): | |
key + "_id", | ||
) | ||
|
||
if k == "name": | ||
extract(data["name"], value, first_level, key + "_name") | ||
# if k == "name": | ||
# extract(data["name"], value, first_level, key + "_name") | ||
Yueqiao12Zhang marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
if isinstance(data[k], dict) or isinstance(data[k], list): | ||
# if there is still a nested instance, extract further | ||
if key.split('_')[-1] not in [ | ||
if key.split("_")[-1] not in [ | ||
Yueqiao12Zhang marked this conversation as resolved.
Show resolved
Hide resolved
|
||
"area", | ||
"artist", | ||
"event", | ||
|
@@ -115,6 +114,7 @@ def extract(data, value: dict, first_level: bool = True, key: str = ""): | |
"recording", | ||
"genres", | ||
]: | ||
# avoid extracting duplicate data | ||
extract(data[k], value, first_level, key + "_" + k) | ||
|
||
elif isinstance(data, list): | ||
|
@@ -152,7 +152,7 @@ def extract(data, value: dict, first_level: bool = True, key: str = ""): | |
return | ||
|
||
|
||
def convert_dict_to_csv(dictionary_list: list, filename: str) -> None: | ||
def convert_dict_to_csv(dictionary_list: list) -> None: | ||
""" | ||
(list, str) -> None | ||
Yueqiao12Zhang marked this conversation as resolved.
Show resolved
Hide resolved
|
||
Writes a list of dictionaries into the given file. | ||
|
@@ -163,40 +163,72 @@ def convert_dict_to_csv(dictionary_list: list, filename: str) -> None: | |
dictionary_list: the list of dictionary that contains all the data | ||
filename: the destination filename | ||
""" | ||
with open(filename, mode="w", newline="", encoding="utf-8") as csv_file: | ||
writer = csv.writer(csv_file) | ||
writer.writerow(header) | ||
# Find the maximum length of lists in the dictionary | ||
|
||
for dictionary in dictionary_list: | ||
max_length = max( | ||
len(v) if isinstance(v, list) else 1 for v in dictionary.values() | ||
) | ||
|
||
for i in range(max_length): | ||
row = [dictionary[f"{entity_type}_id"]] | ||
for key in header: | ||
if key == f"{entity_type}_id": | ||
continue | ||
|
||
if key in dictionary: | ||
if isinstance(dictionary[key], list): | ||
# Append the i-th element of the list, | ||
# or an empty string if index is out of range | ||
row.append( | ||
(dictionary[key])[i] if i < len(dictionary[key]) else "" | ||
) | ||
else: | ||
# Append the single value | ||
# (for non-list entries, only on the first iteration) | ||
row.append(dictionary[key] if i == 0 else "") | ||
|
||
# Find the maximum length of lists in the dictionary | ||
for dictionary in dictionary_list: | ||
max_length = max( | ||
len(v) if isinstance(v, list) else 1 for v in dictionary.values() | ||
) | ||
|
||
for i in range(max_length): | ||
row = [dictionary[f"{entity_type}_id"]] | ||
for key in header: | ||
if key == f"{entity_type}_id": | ||
continue | ||
|
||
if key in dictionary: | ||
if isinstance(dictionary[key], list): | ||
# Append the i-th element of the list, | ||
# or an empty string if index is out of range | ||
row.append( | ||
(dictionary[key])[i] if i < len(dictionary[key]) else "" | ||
) | ||
else: | ||
row.append("") | ||
# Append the single value | ||
# (for non-list entries, only on the first iteration) | ||
row.append(dictionary[key] if i == 0 else "") | ||
else: | ||
row.append("") | ||
|
||
with open( | ||
"temp.csv", mode="a", newline="", encoding="utf-8" | ||
) as csv_records: | ||
writer_records = csv.writer(csv_records) | ||
writer_records.writerow(row) | ||
|
||
writer.writerow(row) | ||
|
||
CHUNK_SIZE = 4096 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could this go at the top with the other configuration constant? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you also add a comment on why you chose 4096? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I don't know exactly; I think this could be any number that is not too large, but ChatGPT and Stack Overflow examples all use something like 4096 or 8192, so I decided to use 4096. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
So, comment that in the code: "4096 was chosen because ChatGPT and StackOverflow examples typically use 4096 or 8192." |
||
|
||
if __name__ == "__main__": | ||
extract(json_data, {}) | ||
|
||
convert_dict_to_csv(values, outputpath) | ||
# the file must be from MusicBrainz's JSON data dumps. | ||
chunk = [] | ||
|
||
with open(inputpath, "r", encoding="utf-8") as f: | ||
for line in f: | ||
line_data = json.loads(line) # Parse each line as a JSON object | ||
chunk.append(line_data) # Add the JSON object to the current chunk | ||
|
||
# When the chunk reaches the desired size, process it | ||
if len(chunk) == CHUNK_SIZE: | ||
extract(chunk, {}) | ||
chunk.clear() # Reset the chunk | ||
convert_dict_to_csv(values) | ||
|
||
values.clear() | ||
|
||
# Process any remaining data in the last chunk | ||
if chunk: | ||
extract(chunk, {}) | ||
chunk.clear() | ||
convert_dict_to_csv(values) | ||
|
||
with open(outputpath, "w", encoding="utf-8") as f: | ||
with open("temp.csv", "r", encoding="utf-8") as f_temp: | ||
f.write(",".join(header)) | ||
f.write("\n") | ||
|
||
for line in f_temp: | ||
f.write(line) | ||
Yueqiao12Zhang marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
os.remove("temp.csv") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There’s no need to loop here. Make ignore_column a set, and then just do “if key in ignore_column”. Since it’s a set the lookup is O(1).
Probably not a huge difference here, but it's good to build these sorts of optimizations into your normal repertoire.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The key can be longer than the ignored string and merely contain it as a substring, so an exact set-membership check ("if key in ignore_column") would not match it; the substring loop is needed so such keys are still ignored.