Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/PULL_REQUEST_TEMPLATE.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,4 +42,4 @@ Verify that the following are valid
* ...

## Other Information
<!-- Add any other helpful information that may be needed here. -->
<!-- Add any other helpful information that may be needed here. -->
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -447,4 +447,4 @@ apps/whisper_fine_tuning/data/

data/

predictions_dir/
predictions_dir/
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ repos:
rev: 24.10.0
hooks:
- id: black
language_version: python3.11 # require Python 3.11 or newer
language_version: python3.12 # require Python 3.12 or newer

- repo: https://github.com/charliermarsh/ruff-pre-commit
rev: v0.1.0
Expand Down Expand Up @@ -41,4 +41,4 @@ repos:
- --disable-error-code=used-before-def
- --disable-error-code=attr-defined
files: \.py$
language_version: python3.11
language_version: python3.12
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ chances of your issue being dealt with quickly:
* **Suggest a Fix** - if you can't fix the bug yourself, perhaps you can point to what might be
causing the problem (line of code or commit)

You can file new issues by providing the above information at the corresponding repository's issues link:
You can file new issues by providing the above information at the corresponding repository's issues link:
replace `[organization-name]` and `[repository-name]` in
`https://github.com/[organization-name]/[repository-name]/issues/new`.

Expand Down
2 changes: 1 addition & 1 deletion LICENSE.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE
SOFTWARE
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -294,4 +294,3 @@ Feel free to open issues for missing docs or suggested improvements.
---

<sub>Maintained by the community with ❤️. Contributions welcome.</sub>

18 changes: 11 additions & 7 deletions apps/text_generation/src/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,15 @@
__author__ = "AI Apps GBB Team"
__version__ = "0.1.0"

import os
import logging
import os
import sys
from logging.handlers import RotatingFileHandler

# python-dotenv is optional: when it cannot be imported, a stand-in with the
# same name is defined so later calls to load_dotenv() still work.
try:  # pragma: no cover - optional dependency for local tooling
    from dotenv import load_dotenv  # type: ignore[import-not-found]
except ImportError:  # pragma: no cover - provide fallback

    def load_dotenv(*args, **kwargs):  # type: ignore[override]
        """Fallback used when python-dotenv is not installed.

        Accepts and ignores any arguments and always returns False.
        """
        return False

Expand All @@ -19,26 +20,29 @@ def setup_logging() -> logging.Logger:
logger.setLevel(logging.DEBUG)

console_handler = logging.StreamHandler(sys.stdout)
console_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
console_format = logging.Formatter(
"%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
console_handler.setFormatter(console_format)

file_handler = RotatingFileHandler(
"app.log", maxBytes=10*1024*1024, backupCount=5
"app.log", maxBytes=10 * 1024 * 1024, backupCount=5
)
file_handler.setLevel(logging.DEBUG)
file_format = logging.Formatter(
'%(asctime)s - %(name)s - %(levelname)s - %(message)s'
"%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
file_handler.setFormatter(file_format)

logger.addHandler(console_handler)
logger.addHandler(file_handler)

return logger


# Build the package-level logger at import time and record that the package
# was initialised.
logger = setup_logging()
logger.info(f"{__app__} - {__author__} - Version: {__version__} initialized")

# Look for a .env file located in the same directory as this module.
env_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), ".env")
if not load_dotenv(dotenv_path=env_path):  # pragma: no cover - optional env file
    # A falsy result is not treated as an error; it is only noted at debug level.
    logger.debug("dotenv skipped or not found at %s", env_path)
188 changes: 96 additions & 92 deletions apps/text_generation/src/blob.py
Original file line number Diff line number Diff line change
@@ -1,115 +1,119 @@
from abc import ABC, abstractmethod
from copy import deepcopy
from typing import Optional, Tuple, Any, BinaryIO
import os
import tempfile
import shutil
import pathlib

from fastapi import UploadFile
import shutil
import tempfile
from abc import ABC, abstractmethod
from copy import deepcopy
from typing import Any, Optional, Tuple

from azure.identity import DefaultAzureCredential # type: ignore[import-not-found]
from azure.storage.blob import BlobServiceClient # type: ignore[import-not-found]
from fastapi import UploadFile

from config.settings import BlobConfig


# Language names looked for in uploaded filenames; matched as substrings of the
# lowercased filename by BlobStorageUploader.infer_language_and_type.
LANGUAGES = ["matis", "mayuruna", "katukina"]
# Document categories looked for in uploaded filenames, matched the same way.
TYPES = ["paper", "book", "dictionary"]


class BlobUploaderPrototype(ABC):
    """Abstract interface for blob uploaders that can be cloned.

    Concrete uploaders must implement both ``clone`` and ``upload_pdf``;
    the class cannot be instantiated until they do.
    """

    @abstractmethod
    def clone(self) -> Any:
        """Return a copy of this uploader instance."""

    @abstractmethod
    def upload_pdf(self, file_path: str) -> str:
        """Upload the PDF found at ``file_path`` and report where it landed.

        Args:
            file_path: Local path to the PDF file.

        Returns:
            The path of the uploaded blob inside the container.
        """


class BlobStorageUploader(BlobUploaderPrototype):
    """Uploader that sends PDF files to Azure Blob Storage organized by language and type.

    The class implements the Prototype pattern via `clone()` so different uploader
    instances can be created from a base configuration. For easier testing, a
    container client may be injected directly.

    Args:
        config: `BlobConfig` containing connection details.
        container_client: Optional pre-created container client for testing or
            advanced scenarios. If None, a `BlobServiceClient` is built from
            `config.account_url` using `DefaultAzureCredential`, and the
            container client for `config.container_name` is obtained from it.

    Raises:
        RuntimeError: If no container client is supplied and the Azure SDK
            names `BlobServiceClient` / `DefaultAzureCredential` are None.
    """

    def __init__(self, config: BlobConfig, container_client: Optional[Any] = None):
        self.config = config
        if container_client is not None:
            # Allow dependency injection for testing
            self.container_client = container_client
            self.client = None
        else:
            if BlobServiceClient is None or DefaultAzureCredential is None:
                raise RuntimeError(
                    "Azure Blob dependencies are required when no container client is provided"
                )
            credential = DefaultAzureCredential()
            self.client = BlobServiceClient(
                account_url=self.config.account_url, credential=credential
            )
            self.container_client = self.client.get_container_client(
                self.config.container_name
            )

    def clone(self) -> "BlobStorageUploader":
        """Return a new uploader with its own copy of the configuration.

        The configuration is deep-copied so the clone can be adjusted without
        affecting this instance. The service client and container client are
        shared by reference rather than copied: the clone keeps the same
        container client reference as the original.

        Returns:
            A new uploader of the same class as this one.
        """

        # Bypass __init__ so no new credential or service client is created.
        duplicate = type(self).__new__(type(self))
        # Shallow-copy every attribute first (this shares the client objects),
        # then replace the config with an independent deep copy.
        duplicate.__dict__.update(self.__dict__)
        duplicate.config = deepcopy(self.config)
        return duplicate

    def infer_language_and_type(self, filename: str) -> Tuple[str, str]:
        """Infer language and document type from a filename.

        Simple heuristic: find the first known language and type contained in the
        lowercased filename. Falls back to 'unknown' and 'other'.

        Args:
            filename: Name of the file to inspect.

        Returns:
            A tuple (language, doc_type).
        """

        lower = filename.lower()
        # Substring match in list order: the first entry of LANGUAGES / TYPES
        # found anywhere in the name wins.
        language = next((lang for lang in LANGUAGES if lang in lower), "unknown")
        doc_type = next((typ for typ in TYPES if typ in lower), "other")
        return language, doc_type

    def upload_pdf(self, file_path: str) -> str:
        """Upload a PDF file to the container under language/type folders.

        The blob path will be: <language>/<type>/<filename>

        Args:
            file_path: Local path to the PDF file.

        Returns:
            The blob path used for the uploaded file.
        """

        filename = os.path.basename(file_path)
        language, doc_type = self.infer_language_and_type(filename)
        blob_path = f"{language}/{doc_type}/{filename}"
        # Open the file in binary mode and upload. The container client is
        # expected to implement upload_blob(blob_path, data, overwrite=True).
        with open(file_path, "rb") as data:
            self.container_client.upload_blob(blob_path, data, overwrite=True)
        return blob_path


def make_blob_uploader() -> BlobStorageUploader:
Expand Down
Loading