changed file to a many-to-many relationship model
nevoodoo committed Jan 29, 2024
1 parent 1ce8f9c commit 5fd3f60
Showing 3 changed files with 153 additions and 34 deletions.
23 changes: 16 additions & 7 deletions db/project.xml
@@ -1113,13 +1113,6 @@
<column name="id" type="INT" autoIncrement="true">
<constraints primaryKey="true" nullable="false" />
</column>
<column name="analysis_id" type="INT">
<constraints
nullable="true"
foreignKeyName="fk_file_analysis"
references="analysis(id)"
/>
</column>
<column name="path" type="VARCHAR(255)">
<constraints
nullable="false"
@@ -1162,6 +1155,22 @@
/>
</column>
</createTable>
<createTable tableName="analysis_file">
<column name="analysis_id" type="INT">
<constraints
nullable="false"
foreignKeyName="FK_ANALYSIS_FILE_ANALYSIS_ID"
references="analysis(id)"
/>
</column>
<column name="file_id" type="INT">
<constraints
nullable="false"
foreignKeyName="FK_ANALYSIS_FILE_FILE_ID"
references="file(id)"
/>
</column>
</createTable>
<sql>ALTER TABLE `file` ADD SYSTEM VERSIONING;</sql>
</changeSet>
</databaseChangeLog>
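The new analysis_file join table replaces the old file.analysis_id foreign key, so a file can now be linked to many analyses and an analysis to many files. As a minimal sketch of how the link is traversed (assuming the same databases library the migration script below uses, and a hypothetical connection string), fetching every file path attached to one analysis becomes a two-table join:

import asyncio

from databases import Database

# Hypothetical connection string; substitute real credentials.
CONNECTION_STRING = 'mysql://user:password@localhost/sm_db'


async def get_file_paths_for_analysis(analysis_id: int) -> list[str]:
    """Fetch all file paths linked to one analysis via analysis_file."""
    async with Database(CONNECTION_STRING) as connection:
        rows = await connection.fetch_all(
            """
            SELECT f.path
            FROM analysis_file af
            INNER JOIN file f ON f.id = af.file_id
            WHERE af.analysis_id = :analysis_id
            """,
            {'analysis_id': analysis_id},
        )
    return [row[0] for row in rows]


print(asyncio.run(get_file_paths_for_analysis(42)))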
103 changes: 102 additions & 1 deletion models/models/file.py
@@ -1,11 +1,15 @@
import hashlib
from typing import Generator
import json
from collections import defaultdict

from cloudpathlib.anypath import AnyPath
from pydantic import BaseModel

from models.base import SMBase


class File(SMBase):
class FileInternal(SMBase):
"""File model for internal use"""

id: int
@@ -17,6 +21,7 @@ class File(SMBase):
nameext: str | None
checksum: str | None
size: int
json_path: str | None = None
secondary_files: list[str] = []

@staticmethod
@@ -32,6 +37,7 @@ def from_db(**kwargs):
nameext = kwargs.get('nameext')
checksum = kwargs.get('checksum')
size = kwargs.get('size')
json_path = kwargs.get('json_path')
secondary_files = kwargs.get('secondary_files')

return FileInternal(
@@ -44,9 +50,27 @@ def from_db(**kwargs):
nameext=nameext,
checksum=checksum,
size=size,
json_path=json_path,
secondary_files=secondary_files,
)

def to_external(self):
"""
Convert to external model
"""
return File(
id=self.id,
analysis_id=self.analysis_id,
path=self.path,
basename=self.basename,
dirname=self.dirname,
nameroot=self.nameroot,
nameext=self.nameext,
checksum=self.checksum,
size=self.size,
secondary_files=self.secondary_files,
)

@staticmethod
def get_basename(path: str) -> str:
"""Get file basename for file at given path"""
@@ -82,3 +106,80 @@ def get_size(path: str) -> int:
return AnyPath(path).stat().st_size # pylint: disable=E1101
except FileNotFoundError:
return 0

@staticmethod
def find_files_from_dict(json_dict: dict, path: list[str] | None = None) -> Generator[tuple[list[str], dict], None, None]:
"""Walk a dict of outputs, yielding a (key path, enclosing dict) pair for each leaf value"""
if not path:
path = []

if isinstance(json_dict, dict):
for key, value in json_dict.items():
if isinstance(value, dict):
# If the value is a dictionary, continue the recursion
yield from FileInternal.find_files_from_dict(value, path + [key])
else:
# Found a leaf: yield the key path and the dict that contains it
yield path, json_dict
else:
# The input itself is not a dictionary, so yield it directly
yield path, json_dict

@staticmethod
def reconstruct_json(data) -> dict:
"""Reconstruct a JSON object from a list of paths and values"""
root: dict = defaultdict(dict)
for path_str, content_str in data:
# Split the path into components and parse the JSON content
path = path_str.split('.')

try:
content = json.loads(content_str)
except json.JSONDecodeError:
print(f'Error parsing JSON: {content_str}')
# Skip this entry: `content` would otherwise be undefined below
continue

# Navigate down the tree to the correct position, creating dictionaries as needed
current = root
for key in path[:-1]:
current = current.setdefault(key, {})

if path[-1] in current:
current[path[-1]].update(content)
else:
current[path[-1]] = content

return root


class File(BaseModel):
"""File model for external use"""

id: int
analysis_id: int
path: str
basename: str
dirname: AnyPath
nameroot: str
nameext: str | None
checksum: str | None
size: int
json_path: str | None = None
secondary_files: list[str] = []

def to_internal(self):
"""
Convert to internal model
"""
return FileInternal(
id=self.id,
analysis_id=self.analysis_id,
path=self.path,
basename=self.basename,
dirname=self.dirname,
nameroot=self.nameroot,
nameext=self.nameext,
checksum=self.checksum,
size=self.size,
json_path=self.json_path,
secondary_files=self.secondary_files,
)
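Taken together, find_files_from_dict flattens a nested outputs dict into (key path, enclosing dict) pairs, and reconstruct_json rebuilds a nested dict from dot-joined key paths with JSON-encoded content, as the paths and payloads might come back from the database. A small usage sketch, with invented example data:

import json

from models.models.file import FileInternal

outputs = {
    'cram': {'path': 'gs://bucket/sampleA.cram'},
    'qc': {'report': {'path': 'gs://bucket/sampleA.html'}},
}

# Walk the nested dict; one pair is yielded per leaf value, so the same
# enclosing dict can appear more than once.
for key_path, leaf_dict in FileInternal.find_files_from_dict(outputs):
    print(key_path, '->', leaf_dict)
# ['cram'] -> {'path': 'gs://bucket/sampleA.cram'}
# ['qc', 'report'] -> {'path': 'gs://bucket/sampleA.html'}

# reconstruct_json expects dot-joined key paths with JSON-encoded content.
rows = [
    ('cram', json.dumps({'path': 'gs://bucket/sampleA.cram'})),
    ('qc.report', json.dumps({'path': 'gs://bucket/sampleA.html'})),
]
# The function returns a defaultdict; dict() gives a plain top-level dict.
print(dict(FileInternal.reconstruct_json(rows)))
# {'cram': {'path': 'gs://bucket/sampleA.cram'}, 'qc': {'report': {'path': 'gs://bucket/sampleA.html'}}}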
61 changes: 35 additions & 26 deletions scripts/20240124_migrate_output_to_file.py
@@ -7,7 +7,7 @@
import click
from databases import Database

from models.models.file import File
from models.models.file import FileInternal


def _get_connection_string():
@@ -27,8 +27,9 @@ async def get_analyses_without_fileid(connection):
"""
SELECT a.id, a.output
FROM analysis a
LEFT JOIN file f ON f.analysis_id = a.id
WHERE f.analysis_id IS NULL
LEFT JOIN analysis_file af ON af.analysis_id = a.id
LEFT JOIN file f ON f.id = af.file_id
WHERE f.id IS NULL
"""
)
print('Fetching...')
@@ -38,25 +39,22 @@
return rows


async def execute_many(connection, query, inserts):
"""Executes many inserts"""
print(f'Inserting {len(inserts)} with query: {query}')
async def execute(connection, query, inserts):
"""Executes inserts"""
await connection.execute(query, inserts)

await connection.execute_many(query, inserts)


def get_file_info(path: str, analysis_id: int) -> Dict:
def get_file_info(path: str) -> Dict:
"""Get file dict"""
print('Extracting file dict')
return {
'analysis_id': analysis_id,
'path': path,
'basename': File.get_basename(path),
'dirname': File.get_dirname(path),
'nameroot': File.get_nameroot(path),
'nameext': File.get_extension(path),
'checksum': File.get_checksum(path),
'size': File.get_size(path),
'basename': FileInternal.get_basename(path),
'dirname': FileInternal.get_dirname(path),
'nameroot': FileInternal.get_nameroot(path),
'nameext': FileInternal.get_extension(path),
'checksum': FileInternal.get_checksum(path),
'size': FileInternal.get_size(path),
'secondary_files': '[]',
}

@@ -92,26 +90,37 @@ async def prepare_files(analyses):
if path_dict:
for _, path in path_dict.items():
print(path)
files.append(
get_file_info(path=path, analysis_id=analysis['id'])
)
files.append((
analysis['id'],
get_file_info(path=path)
))
print('Extracted and added.')
return files


async def insert_files(connection, files):
"""Insert files"""
query = dedent(
"""INSERT INTO file (path, analysis_id, basename, dirname, nameroot, nameext, checksum, size, secondary_files)
VALUES (:path, :analysis_id, :basename, :dirname, :nameroot, :nameext, :checksum, :size, :secondary_files)
"""INSERT INTO file (path, basename, dirname, nameroot, nameext, checksum, size, secondary_files)
VALUES (:path, :basename, :dirname, :nameroot, :nameext, :checksum, :size, :secondary_files)
RETURNING id"""
)
print('Inserting...')
await execute_many(
connection=connection,
query=query,
inserts=files,
af_query = dedent(
"""
INSERT INTO analysis_file (analysis_id, file_id) VALUES (:analysis_id, :file_id)
"""
)
for analysis_id, file in files:
print('Inserting...')
file_id = await connection.fetch_val(
query,
file,
)
await execute(
connection=connection,
query=af_query,
inserts={'analysis_id': analysis_id, 'file_id': file_id},
)
print(f'Inserted {len(files)} files')
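Because the file insert now uses RETURNING id, the script issues one round trip per file (fetch_val) followed by a second insert into analysis_file, rather than a single execute_many. A failure mid-loop could therefore leave a file row without its join-table link. A minimal sketch, not part of this commit, of making the loop atomic with the databases transaction API (assuming connection is the same Database instance the script already passes around):

async def insert_files_atomically(connection, files):
    """Run insert_files inside one transaction so a failed analysis_file
    insert also rolls back the orphaned file row."""
    async with connection.transaction():
        await insert_files(connection, files)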

