Skip to content

Commit

Permalink
test fix for Polanco pdf
Browse files Browse the repository at this point in the history
  • Loading branch information
dansand committed May 23, 2024
1 parent ee59f69 commit c30fbcd
Showing 1 changed file with 63 additions and 7 deletions.
70 changes: 63 additions & 7 deletions .github/scripts/parse_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,41 +153,97 @@ def get_funders(funder_list):
return funders, log


# def parse_image_and_caption(img_string, default_filename):
# log = ""
# image_record = {}
#
# md_regex = r"\[(?P<filename>.*?)\]\((?P<url>.*?)\)"
# html_regex = r'alt="(?P<filename>[^"]+)" src="(?P<url>[^"]+)"'
# pattern = re.compile(r"https://github.com/ModelAtlasofTheEarth/[^/]+/assets/")
#
# # Hack to recognise SVG files
# filetype.add_type(Svg())
#
# caption = []
#
# for string in img_string.split("\r\n"):
# #if "https://github.com/ModelAtlasofTheEarth/model_submission/assets/" in string:
# if pattern.search(string):
# try:
# image_record = re.search(md_regex, string).groupdict()
# except:
# if string.startswith("https://"):
# image_record = {"filename": default_filename, "url": string}
# elif "src" in string:
# image_record = re.search(html_regex, string).groupdict()
# else:
# log += "Error: Could not parse image file and caption\n"
# else:
# caption.append(string)
#
# # Get correct file extension for images
# if "url" in image_record:
# response = requests.get(image_record["url"])
# content_type = response.headers.get("Content-Type")[:5]
# if content_type in ["video", "image"]:
# image_record["filename"] += "." + filetype.get_type(mime=response.headers.get("Content-Type")).extension
#
# image_record["caption"] = "\n".join(caption)
#
# if not caption:
# log += "Error: No caption found for image.\n"
#
# return image_record, log


# Modified version that handles non-image attachments (e.g. PDFs) better.
# The original function is kept above, commented out, for reference.
def parse_image_and_caption(img_string, default_filename):
log = ""
image_record = {}

# Regex to match Markdown image syntax
md_regex = r"\[(?P<filename>.*?)\]\((?P<url>.*?)\)"
# Regex to match HTML image syntax
html_regex = r'alt="(?P<filename>[^"]+)" src="(?P<url>[^"]+)"'
pattern = re.compile(r"https://github.com/ModelAtlasofTheEarth/[^/]+/assets/")
# Pattern to identify the URL structure
pattern = re.compile(r"https://github.com/ModelAtlasofTheEarth/[^/]+/assets/|https://github.com/ModelAtlasofTheEarth/[^/]+/files/")

# Hack to recognise SVG files
# Adding support for SVG files
filetype.add_type(Svg())

caption = []

# Split the input string by line
for string in img_string.split("\r\n"):
#if "https://github.com/ModelAtlasofTheEarth/model_submission/assets/" in string:
# Check if the line contains the expected URL pattern
if pattern.search(string):
try:
# Try to match the Markdown image format
image_record = re.search(md_regex, string).groupdict()
except:
if string.startswith("https://"):
# If it is a URL, use the default filename
image_record = {"filename": default_filename, "url": string}
elif "src" in string:
# Try to match the HTML image format
image_record = re.search(html_regex, string).groupdict()
else:
log += "Error: Could not parse image file and caption\n"
else:
caption.append(string)

# Get correct file extension for images
# If the file is not an image but a document (e.g., PDF), handle it separately
if "url" in image_record:
response = requests.get(image_record["url"])
content_type = response.headers.get("Content-Type")[:5]
if content_type in ["video", "image"]:
image_record["filename"] += "." + filetype.get_type(mime=response.headers.get("Content-Type")).extension
content_type = response.headers.get("Content-Type")
if content_type.startswith("image"):
image_record["filename"] += "." + filetype.get_type(mime=content_type).extension
else:
# For non-image files, use the original filename without modification
image_record["filename"] = image_record["filename"]

# Join the collected caption lines into a single string
image_record["caption"] = "\n".join(caption)

if not caption:
Expand Down

0 comments on commit c30fbcd

Please sign in to comment.