Skip to content

Commit

Permalink
cleanup script args
Browse files Browse the repository at this point in the history
  • Loading branch information
ellisbrown committed Jun 30, 2024
1 parent 43b0ed0 commit 3452072
Show file tree
Hide file tree
Showing 8 changed files with 82 additions and 60 deletions.
1 change: 1 addition & 0 deletions dataengine/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
data/
28 changes: 21 additions & 7 deletions dataengine/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,28 @@ OPENAI_API_KEY="your_openai_key"
GOOGLE_API_KEY="your_google_api_key"
GOOGLE_SE_ID="your_google_search_engine_id"
USER_AGENT="your_user_agent"
WIKIPEDIA_USER_AGENT="<client name>/<version> (<contact information>)" # https://foundation.wikimedia.org/wiki/Policy:User-Agent_policy
# https://foundation.wikimedia.org/wiki/Policy:User-Agent_policy
WIKIPEDIA_USER_AGENT="<client name>/<version> (<contact information>)"

python generate_topics.py &&
python process_json_files.py &&
python clean_and_rename_files.py &&
python wikiflow.py &&
python generate_qa.py &&
python generate_vqa.py
# set args for the scripts
DATA_DIR="./data/"
IN_FILE="${DATA_DIR}/input_fields_subfields.txt"
TOPICS_DIR="${DATA_DIR}/topics/"

WIKI_DIR="${DATA_DIR}/wikidata/"
WIKI_LINKS_DIR="${WIKI_DIR}/wikilinks/"
WIKI_DATA_DIR="${WIKI_DIR}/data/"

IMAGE_DIR="${DATA_DIR}/images/"
QA_DIR="${DATA_DIR}/qadata/"
VQA_DIR="${DATA_DIR}/vqa/"

python generate_topics.py --data_file_path $IN_FILE --output_dir $TOPICS_DIR
python process_json_files.py --topics_dir $TOPICS_DIR
python clean_and_rename_files.py --topics_dir $TOPICS_DIR
python wikiflow.py --topics_dir $TOPICS_DIR --links_dir $WIKI_LINKS_DIR --data_dir $WIKI_DATA_DIR
python generate_qa.py --topics_dir $TOPICS_DIR --data_dir $WIKI_DATA_DIR --output_dir $QA_DIR --image_dir $IMAGE_DIR
python generate_vqa.py --image_dir $IMAGE_DIR --qa_dir $QA_DIR --vqa_dir $VQA_DIR
```

## Explanation
Expand Down
12 changes: 9 additions & 3 deletions dataengine/clean_and_rename_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@ def rename_files(directory):


if __name__ == '__main__':
    import argparse

    # CLI entry point: clean up and normalize the topics JSON files
    # produced by generate_topics.py.
    parser = argparse.ArgumentParser(
        description='Clean and rename topics JSON files in a directory')
    parser.add_argument('--topics_dir', type=str, default='./data/topics/',
                        help='Directory of topics JSON files to process')
    args = parser.parse_args()

    # Drop any non-post files first, then normalize the remaining filenames.
    remove_non_post_files(args.topics_dir)
    rename_files(args.topics_dir)
24 changes: 12 additions & 12 deletions dataengine/generate_qa.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,15 +184,15 @@ def worker(data_chunk, proc_index, result_list, image_directory):
result_list.append(result)


def main(fields, topics_directory, data_path, image_directory, output_directory):
def main(fields, topics_directory, data_dir, image_directory, output_directory):
os.makedirs(image_directory, exist_ok=True)
os.makedirs(output_directory, exist_ok=True)
for field in fields:
image_directory = f'{args.image_directory}/{field}_images'
os.makedirs(image_directory, exist_ok=True)
id = len(glob.glob(os.path.join(image_directory, '*')))
print(f'starting images with id {id}')
topic_files = get_topic_files(data_path, field, topics_directory)
topic_files = get_topic_files(data_dir, field, topics_directory)
filtered_data = get_data(topic_files)
processed_data_file_path = f'{output_directory}/{field}.json'
print(processed_data_file_path)
Expand Down Expand Up @@ -226,15 +226,15 @@ def main(fields, topics_directory, data_path, image_directory, output_directory)
parser = argparse.ArgumentParser(description='Process some topics.')
parser.add_argument('--fields', nargs='+', default=[
'Renewable_Energy_and_Sustainability', 'Geology_and_Earth_Sciences'], help='List of fields files')
parser.add_argument('--topics_directory', type=str,
default='topics', help='Directory of topics')
parser.add_argument('--data_path', type=str,
default='wikidata/data/', help='Data path')
parser.add_argument('--image_directory', type=str,
default='images', help='Directory to store images')
parser.add_argument('--output_directory', type=str,
default='qadata', help='Directory to store q&a processed')
parser.add_argument('--topics_dir', type=str,
default='./data/topics', help='Directory of topics')
parser.add_argument('--data_dir', type=str,
default='./data/wikidata/data/', help='Data path')
parser.add_argument('--image_dir', type=str,
default='./data/images', help='Directory to store images')
parser.add_argument('--output_dir', type=str,
default='./data/qadata', help='Directory to store q&a processed')
args = parser.parse_args()

main(args.fields, args.topics_directory, args.data_path,
args.image_directory, args.output_directory)
main(args.fields, args.topics_dir, args.data_dir,
args.image_dir, args.output_dir)
4 changes: 2 additions & 2 deletions dataengine/generate_topics.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,9 @@ def main(data_file_path, output_dir):
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='Generate detailed lists of topics from subfields')
parser.add_argument('--data_file_path', type=str, default='input_fields_subfields.txt',
parser.add_argument('--data_file_path', type=str, default='./data/input_fields_subfields.txt',
help='Path to the data file containing topics and subfields')
parser.add_argument('--output_dir', type=str, default='topics/',
parser.add_argument('--output_dir', type=str, default='./data/topics/',
help='Directory to output the resulting JSON files')
args = parser.parse_args()

Expand Down
30 changes: 15 additions & 15 deletions dataengine/generate_vqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@
import os


def main(topicname, path_to_images, qa_path, vqa_path):
def main(topicname, image_dir, qa_dir, vqa_dir):
# Read existing files to determine the next starting number
os.makedirs(vqa_path, exist_ok=True)
path_to_images = f'{path_to_images}/{topicname}_images'
json_file_path = f'{qa_path}/{topicname}.json'
output_json_file_path = f'{vqa_path}/{topicname}.json'
os.makedirs(vqa_dir, exist_ok=True)
image_dir = f'{image_dir}/{topicname}_images'
json_file_path = f'{qa_dir}/{topicname}.json'
output_json_file_path = f'{vqa_dir}/{topicname}.json'

existing_files = os.listdir(path_to_images)
existing_files = os.listdir(image_dir)
max_number = max((int(f.split('.')[0]) for f in existing_files if f.split('.')[
0].isdigit() and f.endswith('.png')), default=0)
print(f'Max number is {max_number}')
Expand Down Expand Up @@ -45,21 +45,21 @@ def main(topicname, path_to_images, qa_path, vqa_path):

# Optionally, rename the files in the directory (uncomment to use)
for original_filename, new_filename in renamed_files.items():
os.rename(os.path.join(path_to_images, original_filename),
os.path.join(path_to_images, new_filename))
os.rename(os.path.join(image_dir, original_filename),
os.path.join(image_dir, new_filename))


if __name__ == "__main__":
    # CLI entry point: build the VQA JSON for one topic from its image
    # directory and QA file; main() also renames image files as a side effect.
    parser = argparse.ArgumentParser(
        description='Process and rename images and update JSON data accordingly.')
    parser.add_argument('--topicname', type=str,
                        default='Geology_and_Earth_Sciences',
                        help='Name of the topic to process')
    parser.add_argument('--image_dir', type=str, default='./data/images',
                        help='Path to the images directory')
    parser.add_argument('--qa_dir', type=str, default='./data/qadata/',
                        help='Path to the input qa JSON file dir')
    parser.add_argument('--vqa_dir', type=str, default='./data/vqadata',
                        help='Path to the output vqa JSON file dir')
    args = parser.parse_args()

    main(args.topicname, args.image_dir, args.qa_dir, args.vqa_dir)
10 changes: 8 additions & 2 deletions dataengine/process_json_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,5 +50,11 @@ def process_nested_json(json_data):


if __name__ == '__main__':
    import argparse

    # CLI entry point: post-process the topics JSON files in place.
    parser = argparse.ArgumentParser(
        description='Process topics JSON files in a directory')
    parser.add_argument('--topics_dir', type=str, default='./data/topics/',
                        help='Directory of topics JSON files to process')
    args = parser.parse_args()

    process_json_files(args.topics_dir)
33 changes: 14 additions & 19 deletions dataengine/wikiflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,20 +169,15 @@ def read_links_from_json_file(file_path, topic):
return links


def read_topics_from_file(filename):
    """Load and return the topics mapping stored as JSON in *filename*.

    Args:
        filename: Path to a JSON file (presumably one written by
            generate_topics.py — a dict of topic -> subtopics; verify
            against the caller).

    Returns:
        The deserialized JSON object.
    """
    # Explicit encoding: JSON is UTF-8 by spec; don't depend on the
    # platform's locale-dependent default text encoding.
    with open(filename, 'r', encoding='utf-8') as file:
        return json.load(file)


def main(topics_directory, links_path, data_path):
def main(topics_dir, links_dir, data_dir):
# Modify the directory as needed
# topics_directory = 'topics'
# links_path = 'wikidata/wikilinks/'
# data_path = 'wikidata/data/'
json_files = [os.path.join(topics_directory, f) for f in os.listdir(
topics_directory) if f.endswith('.json')]
os.makedirs(links_path, exist_ok=True)
os.makedirs(data_path, exist_ok=True)
json_files = [os.path.join(topics_dir, f) for f in os.listdir(
topics_dir) if f.endswith('.json')]
os.makedirs(links_dir, exist_ok=True)
os.makedirs(data_dir, exist_ok=True)
skip = False
print(json_files)
for json_file in json_files:
Expand All @@ -193,7 +188,7 @@ def main(topics_directory, links_path, data_path):
for topic, subtopics in topics.items():
for subtopic in subtopics:
# Define the path to check if the file exists
directory_path = f'{data_path}{topic}'
directory_path = f'{data_dir}{topic}'
os.makedirs(directory_path, exist_ok=True)
file_path = f'{directory_path}/{subtopic}.json'
# Check if the file exists; if not, proceed with the following operations
Expand All @@ -220,35 +215,35 @@ def main(topics_directory, links_path, data_path):
title, file_base_name, topic, subtopic, links[i])
if dataset is not None:
totaldataset.extend(dataset)
write_datalinks_to_file(topic, subtopic, links, links_path)
write_datalinks_to_file(topic, subtopic, links, links_dir)
append_data_to_file(
topic, subtopic, totaldataset, file_path)
else:
for topic, subtopics in topics.items():
for subtopic in subtopics:
links = read_links_from_json_file(
f'{links_path}{file_base_name}.json', subtopic)
f'{links_dir}{file_base_name}.json', subtopic)
titles = [title for title in titles]
# print(links)
totaldataset = []
for title in titles:
dataset = scrape_wikipedia(
title, file_base_name, topic, subtopic)
totaldataset.extend(dataset)
write_datalinks_to_file(topic, subtopic, links, links_path)
write_datalinks_to_file(topic, subtopic, links, links_dir)
append_data_to_file(
topic, subtopic, totaldataset, file_path)


if __name__ == '__main__':
    # CLI entry point for the Wikipedia scraping flow.
    parser = argparse.ArgumentParser(
        description='Process Wikipedia data for various topics.')
    # NOTE(review): flag renamed from --topics_directory to --topics_dir so it
    # matches the pipeline invocation documented in the README
    # (`python wikiflow.py --topics_dir ...`) and the sibling scripts' flags.
    parser.add_argument('--topics_dir', type=str, default='./data/topics',
                        help='Directory containing topics JSON files')
    parser.add_argument('--links_dir', type=str,
                        default='./data/wikidata/wikilinks/',
                        help='Directory to store links data')
    parser.add_argument('--data_dir', type=str, default='./data/wikidata/data/',
                        help='Directory to store processed data')
    args = parser.parse_args()

    main(args.topics_dir, args.links_dir, args.data_dir)

0 comments on commit 3452072

Please sign in to comment.