From 3452072c7b77d13fb159928946daab6664297734 Mon Sep 17 00:00:00 2001 From: Ellis Brown Date: Sun, 30 Jun 2024 14:21:50 -0700 Subject: [PATCH] cleanup script args --- dataengine/.gitignore | 1 + dataengine/README.md | 28 +++++++++++++++++------ dataengine/clean_and_rename_files.py | 12 +++++++--- dataengine/generate_qa.py | 24 ++++++++++---------- dataengine/generate_topics.py | 4 ++-- dataengine/generate_vqa.py | 30 ++++++++++++------------- dataengine/process_json_files.py | 10 +++++++-- dataengine/wikiflow.py | 33 ++++++++++++---------------- 8 files changed, 82 insertions(+), 60 deletions(-) create mode 100644 dataengine/.gitignore diff --git a/dataengine/.gitignore b/dataengine/.gitignore new file mode 100644 index 0000000..adbb97d --- /dev/null +++ b/dataengine/.gitignore @@ -0,0 +1 @@ +data/ \ No newline at end of file diff --git a/dataengine/README.md b/dataengine/README.md index 682ff3c..5d07b9d 100644 --- a/dataengine/README.md +++ b/dataengine/README.md @@ -10,14 +10,28 @@ OPENAI_API_KEY="your_openai_key" GOOGLE_API_KEY="your_google_api_key" GOOGLE_SE_ID="your_google_search_engine_id" USER_AGENT="your_user_agent" -WIKIPEDIA_USER_AGENT="/ ()" # https://foundation.wikimedia.org/wiki/Policy:User-Agent_policy +# https://foundation.wikimedia.org/wiki/Policy:User-Agent_policy +WIKIPEDIA_USER_AGENT="/ ()" -python generate_topics.py && -python process_json_files.py && -python clean_and_rename_files.py && -python wikiflow.py && -python generate_qa.py && -python generate_vqa.py +# set args for the scripts +DATA_DIR="./data/" +IN_FILE="${DATA_DIR}/input_fields_subfields.txt" +TOPICS_DIR="${DATA_DIR}/topics/" + +WIKI_DIR="${DATA_DIR}/wikidata/" +WIKI_LINKS_DIR="${WIKI_DIR}/wikilinks/" +WIKI_DATA_DIR="${WIKI_DIR}/data/" + +IMAGE_DIR="${DATA_DIR}/images/" +QA_DIR="${DATA_DIR}/qadata/" +VQA_DIR="${DATA_DIR}/vqa/" + +python generate_topics.py --data_file_path $IN_FILE --output_dir $TOPICS_DIR +python process_json_files.py --topics_dir $TOPICS_DIR +python 
clean_and_rename_files.py --topics_dir $TOPICS_DIR +python wikiflow.py --topics_dir $TOPICS_DIR --links_dir $WIKI_LINKS_DIR --data_dir $WIKI_DATA_DIR +python generate_qa.py --topics_dir $TOPICS_DIR --data_dir $WIKI_DATA_DIR --output_dir $QA_DIR --image_dir $IMAGE_DIR +python generate_vqa.py --image_dir $IMAGE_DIR --qa_dir $QA_DIR --vqa_dir $VQA_DIR ``` ## Explanation diff --git a/dataengine/clean_and_rename_files.py b/dataengine/clean_and_rename_files.py index fe0d9d1..ca2df79 100644 --- a/dataengine/clean_and_rename_files.py +++ b/dataengine/clean_and_rename_files.py @@ -21,6 +21,12 @@ def rename_files(directory): if __name__ == '__main__': - directory_path = 'topics' - remove_non_post_files(directory_path) - rename_files(directory_path) + import argparse + parser = argparse.ArgumentParser( + description='Clean and rename topics JSON files in a directory') + parser.add_argument('--topics_dir', type=str, default='./data/topics/', + help='Directory of topics JSON files to process') + args = parser.parse_args() + + remove_non_post_files(args.topics_dir) + rename_files(args.topics_dir) diff --git a/dataengine/generate_qa.py b/dataengine/generate_qa.py index 37f8dc2..736b9ba 100644 --- a/dataengine/generate_qa.py +++ b/dataengine/generate_qa.py @@ -184,7 +184,7 @@ def worker(data_chunk, proc_index, result_list, image_directory): result_list.append(result) -def main(fields, topics_directory, data_path, image_directory, output_directory): +def main(fields, topics_directory, data_dir, image_directory, output_directory): os.makedirs(image_directory, exist_ok=True) os.makedirs(output_directory, exist_ok=True) for field in fields: @@ -192,7 +192,7 @@ def main(fields, topics_directory, data_path, image_directory, output_directory) os.makedirs(image_directory, exist_ok=True) id = len(glob.glob(os.path.join(image_directory, '*'))) print(f'starting images with id {id}') - topic_files = get_topic_files(data_path, field, topics_directory) + topic_files = get_topic_files(data_dir, 
field, topics_directory) filtered_data = get_data(topic_files) processed_data_file_path = f'{output_directory}/{field}.json' print(processed_data_file_path) @@ -226,15 +226,15 @@ def main(fields, topics_directory, data_path, image_directory, output_directory) parser = argparse.ArgumentParser(description='Process some topics.') parser.add_argument('--fields', nargs='+', default=[ 'Renewable_Energy_and_Sustainability', 'Geology_and_Earth_Sciences'], help='List of fields files') - parser.add_argument('--topics_directory', type=str, - default='topics', help='Directory of topics') - parser.add_argument('--data_path', type=str, - default='wikidata/data/', help='Data path') - parser.add_argument('--image_directory', type=str, - default='images', help='Directory to store images') - parser.add_argument('--output_directory', type=str, - default='qadata', help='Directory to store q&a processed') + parser.add_argument('--topics_dir', type=str, + default='./data/topics', help='Directory of topics') + parser.add_argument('--data_dir', type=str, + default='./data/wikidata/data/', help='Data path') + parser.add_argument('--image_dir', type=str, + default='./data/images', help='Directory to store images') + parser.add_argument('--output_dir', type=str, + default='./data/qadata', help='Directory to store q&a processed') args = parser.parse_args() - main(args.fields, args.topics_directory, args.data_path, - args.image_directory, args.output_directory) + main(args.fields, args.topics_dir, args.data_dir, + args.image_dir, args.output_dir) diff --git a/dataengine/generate_topics.py b/dataengine/generate_topics.py index 9a22c9b..5c6edf2 100644 --- a/dataengine/generate_topics.py +++ b/dataengine/generate_topics.py @@ -45,9 +45,9 @@ def main(data_file_path, output_dir): if __name__ == '__main__': parser = argparse.ArgumentParser( description='Generate detailed lists of topics from subfields') - parser.add_argument('--data_file_path', type=str, default='input_fields_subfields.txt', + 
parser.add_argument('--data_file_path', type=str, default='./data/input_fields_subfields.txt', help='Path to the data file containing topics and subfields') - parser.add_argument('--output_dir', type=str, default='topics/', + parser.add_argument('--output_dir', type=str, default='./data/topics/', help='Directory to output the resulting JSON files') args = parser.parse_args() diff --git a/dataengine/generate_vqa.py b/dataengine/generate_vqa.py index df13d6a..42bbd33 100644 --- a/dataengine/generate_vqa.py +++ b/dataengine/generate_vqa.py @@ -3,14 +3,14 @@ import os -def main(topicname, path_to_images, qa_path, vqa_path): +def main(topicname, image_dir, qa_dir, vqa_dir): # Read existing files to determine the next starting number - os.makedirs(vqa_path, exist_ok=True) - path_to_images = f'{path_to_images}/{topicname}_images' - json_file_path = f'{qa_path}/{topicname}.json' - output_json_file_path = f'{vqa_path}/{topicname}.json' + os.makedirs(vqa_dir, exist_ok=True) + image_dir = f'{image_dir}/{topicname}_images' + json_file_path = f'{qa_dir}/{topicname}.json' + output_json_file_path = f'{vqa_dir}/{topicname}.json' - existing_files = os.listdir(path_to_images) + existing_files = os.listdir(image_dir) max_number = max((int(f.split('.')[0]) for f in existing_files if f.split('.')[ 0].isdigit() and f.endswith('.png')), default=0) print(f'Max number is {max_number}') @@ -45,8 +45,8 @@ def main(topicname, path_to_images, qa_path, vqa_path): # Optionally, rename the files in the directory (uncomment to use) for original_filename, new_filename in renamed_files.items(): - os.rename(os.path.join(path_to_images, original_filename), - os.path.join(path_to_images, new_filename)) + os.rename(os.path.join(image_dir, original_filename), + os.path.join(image_dir, new_filename)) if __name__ == "__main__": @@ -54,12 +54,12 @@ def main(topicname, path_to_images, qa_path, vqa_path): description='Process and rename images and update JSON data accordingly.') 
parser.add_argument('--topicname', type=str, default='Geology_and_Earth_Sciences', help='Name of the topic to process') - parser.add_argument('--path_to_images', type=str, - default='images', help='Path to the images directory') - parser.add_argument('--qa_path', type=str, default='qadata', - help='Path to the input qa JSON file') - parser.add_argument('--vqa_path', type=str, default='vqadata', - help='Path to the output vqa JSON file') + parser.add_argument('--image_dir', type=str, + default='./data/images', help='Path to the images directory') + parser.add_argument('--qa_dir', type=str, default='./data/qadata/', + help='Path to the input qa JSON file dir') + parser.add_argument('--vqa_dir', type=str, default='./data/vqadata', + help='Path to the output vqa JSON file dir') args = parser.parse_args() - main(args.topicname, args.path_to_images, args.qa_path, args.vqa_path) + main(args.topicname, args.image_dir, args.qa_dir, args.vqa_dir) diff --git a/dataengine/process_json_files.py b/dataengine/process_json_files.py index 22566cc..f307b18 100644 --- a/dataengine/process_json_files.py +++ b/dataengine/process_json_files.py @@ -50,5 +50,11 @@ def process_nested_json(json_data): if __name__ == '__main__': - directory = 'topics' - process_json_files(directory) + import argparse + parser = argparse.ArgumentParser( + description='Process topics JSON files in a directory') + parser.add_argument('--topics_dir', type=str, default='./data/topics/', + help='Directory of topics JSON files to process') + args = parser.parse_args() + + process_json_files(args.topics_dir) diff --git a/dataengine/wikiflow.py b/dataengine/wikiflow.py index d934b37..dd68d3e 100644 --- a/dataengine/wikiflow.py +++ b/dataengine/wikiflow.py @@ -169,20 +169,15 @@ def read_links_from_json_file(file_path, topic): return links -def read_topics_from_file(filename): - with open(filename, 'r') as file: - return json.load(file) - - -def main(topics_directory, links_path, data_path): +def main(topics_dir, 
links_dir, data_dir): # Modify the directory as needed # topics_directory = 'topics' # links_path = 'wikidata/wikilinks/' # data_path = 'wikidata/data/' - json_files = [os.path.join(topics_directory, f) for f in os.listdir( - topics_directory) if f.endswith('.json')] - os.makedirs(links_path, exist_ok=True) - os.makedirs(data_path, exist_ok=True) + json_files = [os.path.join(topics_dir, f) for f in os.listdir( + topics_dir) if f.endswith('.json')] + os.makedirs(links_dir, exist_ok=True) + os.makedirs(data_dir, exist_ok=True) skip = False print(json_files) for json_file in json_files: @@ -193,7 +188,7 @@ def main(topics_directory, links_path, data_path): for topic, subtopics in topics.items(): for subtopic in subtopics: # Define the path to check if the file exists - directory_path = f'{data_path}{topic}' + directory_path = f'{data_dir}{topic}' os.makedirs(directory_path, exist_ok=True) file_path = f'{directory_path}/{subtopic}.json' # Check if the file exists; if not, proceed with the following operations @@ -220,14 +215,14 @@ def main(topics_directory, links_path, data_path): title, file_base_name, topic, subtopic, links[i]) if dataset is not None: totaldataset.extend(dataset) - write_datalinks_to_file(topic, subtopic, links, links_path) + write_datalinks_to_file(topic, subtopic, links, links_dir) append_data_to_file( topic, subtopic, totaldataset, file_path) else: for topic, subtopics in topics.items(): for subtopic in subtopics: links = read_links_from_json_file( - f'{links_path}{file_base_name}.json', subtopic) + f'{links_dir}{file_base_name}.json', subtopic) titles = [title for title in titles] # print(links) totaldataset = [] @@ -235,7 +230,7 @@ def main(topics_directory, links_path, data_path): dataset = scrape_wikipedia( title, file_base_name, topic, subtopic) totaldataset.extend(dataset) - write_datalinks_to_file(topic, subtopic, links, links_path) + write_datalinks_to_file(topic, subtopic, links, links_dir) append_data_to_file( topic, subtopic, 
totaldataset, file_path) @@ -243,12 +238,12 @@ def main(topics_directory, links_path, data_path): if __name__ == '__main__': parser = argparse.ArgumentParser( description='Process Wikipedia data for various topics.') - parser.add_argument('--topics_directory', type=str, default='topics', + parser.add_argument('--topics_dir', type=str, default='./data/topics', help='Directory containing topics JSON files') - parser.add_argument('--links_path', type=str, - default='wikidata/wikilinks/', help='Directory to store links data') - parser.add_argument('--data_path', type=str, default='wikidata/data/', + parser.add_argument('--links_dir', type=str, + default='./data/wikidata/wikilinks/', help='Directory to store links data') + parser.add_argument('--data_dir', type=str, default='./data/wikidata/data/', help='Directory to store processed data') args = parser.parse_args() - main(args.topics_directory, args.links_path, args.data_path) + main(args.topics_dir, args.links_dir, args.data_dir)