Skip to content

Commit

Permalink
[#8] file metadata - media object to optionally use base url for contentUrl field
Browse files Browse the repository at this point in the history
  • Loading branch information
pkdash committed Jun 24, 2024
1 parent f43d986 commit 866842d
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 9 deletions.
18 changes: 14 additions & 4 deletions hsextract/file_utils.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import hashlib
import os
import mimetypes

import os
from functools import partial
from typing import Optional


def file_metadata(path: str):
def file_metadata(path: str, base_url: Optional[str] = None):
# path = "/files/" + path
with open(path, "rb") as f:
d = hashlib.sha256()
Expand All @@ -15,8 +15,18 @@ def file_metadata(path: str):
size = f"{os.path.getsize(path)/1000.00} KB"
mime_type = mimetypes.guess_type(path)[0]
file_name = os.path.basename(path)
# strip the mount location from the path - assuming files are volume mounted at /files
if path.startswith("/files/"):
path = path[7:]
if base_url is not None:
base_url = base_url.rstrip("/")
path = path.lstrip("/")
file_url = os.path.join(base_url, path)
else:
file_url = path

file_meta = {
"contentUrl": path,
"contentUrl": file_url,
"contentSize": size,
"sha256": checksum,
"encodingFormat": mime_type,
Expand Down
13 changes: 8 additions & 5 deletions hsextract/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,9 @@ def save_metadata(path: str, metadata_dict: dict):
with open(metadata_file_path, "w") as f:
json.dump(metadata_dict, f, indent=2)

print(f"Metadata was saved to file path: {metadata_file_path}")
message = f"Metadata was saved to file path: {metadata_file_path}"
print(message, flush=True)
logging.info(message)


def _to_metadata_path(filepath: str, user_metadata_filename: str):
Expand All @@ -79,12 +81,13 @@ def extract_metadata_with_file_path(type: str, filepath: str, user_metadata_file
return filepath, extracted_metadata is not None


def extract_metadata(type: str, filepath, use_adapter=True):
def extract_metadata(type: str, filepath: str, base_url: Optional[str] = None, use_adapter=True):
# use_adapter is a flag to determine if the metadata should be converted to a catalog record
# it is set to False in tests when testing for the raw extracted metadata

print(f">> Extracting {type} metadata from {filepath}", flush=True)
logging.info(f"Extracting {type} metadata from {filepath}")
message = f"Extracting {type} metadata from {filepath}"
print(message, flush=True)
logging.info(message)

extension = os.path.splitext(filepath)[1]
try:
Expand All @@ -95,7 +98,7 @@ def extract_metadata(type: str, filepath, use_adapter=True):
adapter = HydroshareMetadataAdapter()
all_file_metadata = []
for f in extracted_metadata["content_files"]:
f_md, _ = file_metadata(f)
f_md, _ = file_metadata(path=f, base_url=base_url)
all_file_metadata.append(f_md)
del extracted_metadata["content_files"]
if type == "user_meta":
Expand Down

0 comments on commit 866842d

Please sign in to comment.