Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion ambuda/consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,14 @@ class Locale:
"saundaranandam",
"hamsadutam",
],
"upanishat": ["shivopanishat"],
"upanishat": [
"shivopanishat",
"isa"
],
"anye": [
"bodhicaryavatara",
"catuhshloki",

],
}

Expand Down
2 changes: 1 addition & 1 deletion ambuda/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

# For convenience, import all models into this module.

from ambuda.enums import SiteRole # NOQA F401
from ambuda.enums import SitePageStatus, SiteRole, TextGenre # NOQA F401
from ambuda.models.auth import * # NOQA F401,F403
from ambuda.models.base import Base # NOQA F401,F403
from ambuda.models.blog import * # NOQA F401,F403
Expand Down
8 changes: 8 additions & 0 deletions ambuda/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,11 @@ class SitePageStatus(str, Enum):
R2 = "reviewed-2"
#: Not relevant.
SKIP = "skip"

class TextGenre(str, Enum):
    """Genres used to categorize texts on the site."""

    #: Itihasa texts.
    ITIHASA = "Itihasa"
    #: Upanishads.
    UPANISHAT = "Upanishat"
    #: Kavya (poetry) texts.
    KAVYA = "Kavya"
    #: "Anye" (other): texts that don't fit the genres above.
    ANYE = "Anye"
6 changes: 4 additions & 2 deletions ambuda/models/texts.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,11 @@
- `TextBlock` is typically a verse or paragraph within a `TextSection`.
"""

from ambuda.models.base import Base, foreign_key, pk
from sqlalchemy import Column, Integer, String
from sqlalchemy import Text as _Text
from sqlalchemy.orm import relationship

from ambuda.models.base import Base, foreign_key, pk


class Text(Base):

Expand All @@ -30,6 +29,9 @@ class Text(Base):
header = Column(_Text)
#: An ordered list of the sections contained within this text.
sections = relationship("TextSection", backref="text", cascade="delete")
#: Add a genre for the text
genre_id = foreign_key("genres.id")
genre = relationship("Genre")

def __str__(self):
return self.slug
Expand Down
10 changes: 10 additions & 0 deletions ambuda/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,16 @@ def texts() -> list[db.Text]:
return session.query(db.Text).all()


def texts_genre(genre=None) -> list[str]:
    """Return the slugs of all texts, optionally filtered by genre.

    NOTE: the inner join on `Genre` means texts that have no genre are
    excluded from the result even when `genre` is None.

    :param genre: a `db.Genre` row, a `TextGenre` enum member, or a plain
        genre-name string. If falsy, no genre filter is applied.
    :return: a list of text slugs.
    """
    session = get_session()
    # Join texts to their genres so we can filter by genre name.
    query = session.query(db.Text.slug).join(db.Genre)
    if genre:
        # Seeded `Genre.name` values come from `TextGenre.*.value` (e.g.
        # "Kavya"), so for an enum member we must compare against `.value`,
        # not `.name` (which would be "KAVYA" and never match). A `db.Genre`
        # row has no `.value` attribute, so fall back to its `.name`; a bare
        # string is used as-is.
        name = getattr(genre, "value", None) or getattr(genre, "name", genre)
        query = query.filter(db.Genre.name == name)
    return [slug for (slug,) in query.all()]


def page_statuses() -> list[db.PageStatus]:
session = get_session()
return session.query(db.PageStatus).all()
Expand Down
5 changes: 3 additions & 2 deletions ambuda/seed/lookup/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from . import create_bot_user, page_status, role
from . import create_bot_user, genres, page_status, role


def run():
Expand All @@ -9,9 +9,10 @@ def run():
page_status.run()
role.run()
create_bot_user.run()
genres.run()
except Exception as ex:
raise Exception(
"Error: Failed to create page statuses, "
"Error: Failed to genres, create page statuses, "
"create roles, and creat bot user."
f"Error: {ex}"
) from ex
35 changes: 35 additions & 0 deletions ambuda/seed/lookup/genres.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import logging

import ambuda.database as db
from ambuda.enums import TextGenre
from ambuda.seed.utils.data_utils import create_db
from sqlalchemy.orm import Session


def run(engine=None):
    """Create text genres iff they don't exist already.

    NOTE: this script doesn't delete existing genres.

    :param engine: optional SQLAlchemy engine. If omitted, a default
        database connection is created via `create_db`.
    """

    engine = engine or create_db()
    with Session(engine) as session:
        existing_names = {g.name for g in session.query(db.Genre).all()}
        # Seed only the genres that are missing from the database.
        new_names = {genre.value for genre in TextGenre} - existing_names

        if new_names:
            for name in new_names:
                session.add(db.Genre(name=name))
                logging.debug(f"Created genre: {name}")
            session.commit()

    logging.debug("Done. The following genres are defined:")
    with Session(engine) as session:
        for g in session.query(db.Genre).all():
            logging.debug(f"- {g.name}")


if __name__ == "__main__":
    run()
94 changes: 21 additions & 73 deletions ambuda/seed/texts/gretil.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,48 +2,36 @@

import logging
import subprocess
from dataclasses import dataclass
from pathlib import Path

from sqlalchemy.orm import Session

import ambuda.database as db
from ambuda.seed.utils.data_utils import create_db
from ambuda.utils.tei_parser import Document, parse_document


@dataclass
class Spec:
slug: str
title: str
filename: str

from ambuda.enums import TextGenre
from ambuda.seed.utils.data_utils import Spec, add_document, create_db

REPO = "https://github.com/ambuda-org/gretil.git"
PROJECT_DIR = Path(__file__).resolve().parents[3]
DATA_DIR = PROJECT_DIR / "data" / "ambuda-gretil"
#: Slug to use for texts that have only one section.

ALLOW = [
Spec("amarushatakam", "amaruzatakam", "sa_amaru-amaruzataka.xml"),
Spec("kumarasambhavam", "kumArasambhavam", "sa_kAlidAsa-kumArasaMbhava.xml"),
Spec("raghuvamsham", "raghuvaMzam", "sa_kAlidAsa-raghuvaMza.xml"),
Spec("kiratarjuniyam", "kirAtArjunIyam", "sa_bhAravi-kirAtArjunIya.xml"),
Spec("shishupalavadham", "zizupAlavadham", "sa_mAgha-zizupAlavadha.xml"),
Spec("rtusamharam", "RtusaMhAram", "sa_kAlidAsa-RtusaMhAra.xml"),
Spec("shatakatrayam", "zatakatrayam", "sa_bhatRhari-zatakatraya.xml"),
Spec("bhattikavyam", "bhaTTikAvyam", "sa_bhaTTi-rAvaNavadha.xml"),
Spec("meghadutam-kale", "meghadUtam", "sa_kAlidAsa-meghadUta-edkale.xml"),
Spec("kokilasandesha", "kokilasaMdezaH", "sa_uddaNDa-kokilasaMdesa.xml"),
Spec("bodhicaryavatara", "bodhicaryAvatAraH", "sa_zAntideva-bodhicaryAvatAra.xml"),
Spec("amarushatakam", "amaruzatakam", "sa_amaru-amaruzataka.xml", TextGenre.KAVYA),
Spec("kumarasambhavam", "kumArasambhavam", "sa_kAlidAsa-kumArasaMbhava.xml", TextGenre.KAVYA),
Spec("raghuvamsham", "raghuvaMzam", "sa_kAlidAsa-raghuvaMza.xml", TextGenre.KAVYA),
Spec("kiratarjuniyam", "kirAtArjunIyam", "sa_bhAravi-kirAtArjunIya.xml", TextGenre.KAVYA),
Spec("shishupalavadham", "zizupAlavadham", "sa_mAgha-zizupAlavadha.xml", TextGenre.KAVYA),
Spec("rtusamharam", "RtusaMhAram", "sa_kAlidAsa-RtusaMhAra.xml", TextGenre.KAVYA),
Spec("shatakatrayam", "zatakatrayam", "sa_bhatRhari-zatakatraya.xml", TextGenre.KAVYA),
Spec("bhattikavyam", "bhaTTikAvyam", "sa_bhaTTi-rAvaNavadha.xml", TextGenre.KAVYA),
Spec("meghadutam-kale", "meghadUtam", "sa_kAlidAsa-meghadUta-edkale.xml", TextGenre.KAVYA),
Spec("kokilasandesha", "kokilasaMdezaH", "sa_uddaNDa-kokilasaMdesa.xml", TextGenre.KAVYA),
Spec("bodhicaryavatara", "bodhicaryAvatAraH", "sa_zAntideva-bodhicaryAvatAra.xml", TextGenre.ANYE),
Spec(
"saundaranandam", "saundaranandam", "sa_azvaghoSa-saundarAnanda-edmatsunami.xml"
"saundaranandam", "saundaranandam", "sa_azvaghoSa-saundarAnanda-edmatsunami.xml", TextGenre.KAVYA
),
Spec("caurapancashika", "caurapaJcAzikA", "sa_bilhaNa-caurapaJcAzikA.xml"),
Spec("hamsadutam", "haMsadUtam", "sa_rUpagosvAmin-haMsadUta.xml"),
Spec("mukundamala", "mukundamAlA", "sa_kulazekhara-mukundamAlA-eddurgaprasad.xml"),
Spec("shivopanishat", "zivopaniSat", "sa_zivopaniSad.xml"),
Spec("catuhshloki", "catuHzlokI", "sa_yAmuna-catuHzlokI.xml"),
Spec("caurapancashika", "caurapaJcAzikA", "sa_bilhaNa-caurapaJcAzikA.xml", TextGenre.KAVYA),
Spec("hamsadutam", "haMsadUtam", "sa_rUpagosvAmin-haMsadUta.xml", TextGenre.KAVYA),
Spec("mukundamala", "mukundamAlA", "sa_kulazekhara-mukundamAlA-eddurgaprasad.xml", TextGenre.KAVYA),
Spec("shivopanishat", "zivopaniSat", "sa_zivopaniSad.xml", TextGenre.UPANISHAT),
Spec("catuhshloki", "catuHzlokI", "sa_yAmuna-catuHzlokI.xml", TextGenre.ANYE),
]


Expand All @@ -62,47 +50,6 @@ def fetch_latest_data():
subprocess.call("git reset --hard origin/main", shell=True, cwd=DATA_DIR)


def _create_new_text(session, spec: Spec, document: Document):
text = db.Text(slug=spec.slug, title=spec.title, header=document.header)
session.add(text)
session.flush()

n = 1
for section in document.sections:
db_section = db.TextSection(
text_id=text.id, slug=section.slug, title=section.slug
)
session.add(db_section)
session.flush()

for block in section.blocks:
db_block = db.TextBlock(
text_id=text.id,
section_id=db_section.id,
slug=block.slug,
xml=block.blob,
n=n,
)
session.add(db_block)
n += 1

session.commit()


def add_document(engine, spec: Spec):
document_path = DATA_DIR / "1_sanskr" / "tei" / spec.filename

with Session(engine) as session:
if session.query(db.Text).filter_by(slug=spec.slug).first():
# FIXME: update existing texts in-place so that we can capture
# changes. As a workaround for now, we can delete then re-create.
log(f"- Skipped {spec.slug} (already exists)")
else:
document = parse_document(document_path)
_create_new_text(session, spec, document)
log(f"- Created {spec.slug}")


def run():
logging.getLogger().setLevel(0)
log("Downloading the latest data ...")
Expand All @@ -114,7 +61,8 @@ def run():
engine = create_db()

for spec in ALLOW:
add_document(engine, spec)
document_path = DATA_DIR / "1_sanskr" / "tei" / spec.filename
add_document(engine, spec, document_path)
except Exception as ex:
raise Exception("Error: Failed to get latest from GRETIL.") from ex

Expand Down
61 changes: 58 additions & 3 deletions ambuda/seed/utils/data_utils.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,26 @@
import hashlib
import io
import logging
import os
import zipfile

import requests
from sqlalchemy import create_engine
from dataclasses import dataclass

import config
import requests
from ambuda import database as db
from ambuda.seed.utils.itihasa_utils import CACHE_DIR
from ambuda.utils.tei_parser import Document, parse_document
from sqlalchemy import create_engine
from sqlalchemy.orm import Session

LOG = logging.getLogger(__name__)

@dataclass
class Spec:
    """Describes one source text to seed into the database."""

    #: Unique identifier for the text in the database.
    slug: str
    #: Human-readable title, as given in the source data.
    title: str
    #: Name of the source document file; the caller joins this onto its
    #: data directory to get the full path.
    filename: str
    #: Genre to assign to the text (a `TextGenre` enum member).
    genre: db.TextGenre


def fetch_text(url: str, read_from_cache: bool = True) -> str:
Expand Down Expand Up @@ -83,3 +95,46 @@ def create_db():

db.Base.metadata.create_all(engine)
return engine


def _create_new_text(session, spec: Spec, document: Document):
    """Insert `document` into the database as a new text.

    Creates one `Text` row plus its `TextSection` and `TextBlock` children,
    then commits the session.
    """
    # Find the seeded Genre row that matches the spec's genre enum.
    genre_row = session.query(db.Genre).filter_by(name=spec.genre.value).first()
    text_row = db.Text(
        slug=spec.slug,
        title=spec.title,
        header=document.header,
        genre=genre_row,
    )
    session.add(text_row)
    # Flush so that `text_row.id` is assigned before creating child rows.
    session.flush()

    # `n` is a running index over every block in the text, across sections.
    block_index = 1
    for section in document.sections:
        section_row = db.TextSection(
            text_id=text_row.id, slug=section.slug, title=section.slug
        )
        session.add(section_row)
        # Flush so that `section_row.id` is assigned before its blocks.
        session.flush()

        for block in section.blocks:
            session.add(
                db.TextBlock(
                    text_id=text_row.id,
                    section_id=section_row.id,
                    slug=block.slug,
                    xml=block.blob,
                    n=block_index,
                )
            )
            block_index += 1

    session.commit()


def add_document(engine, spec: Spec, document_path):
" Add a document to the database. "

with Session(engine) as session: # noqa: F821
if session.query(db.Text).filter_by(slug=spec.slug).first():
# FIXME: update existing texts in-place so that we can capture
# changes. As a workaround for now, we can delete then re-create.
LOG.info(f"- Skipped {spec.slug} (already exists)") # noqa: F821
else:
document = parse_document(document_path)
_create_new_text(session, spec, document)
LOG.info(f"- Created {spec.slug}") # noqa: F821
6 changes: 3 additions & 3 deletions ambuda/seed/utils/itihasa_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,11 @@
from dataclasses import dataclass
from pathlib import Path

import ambuda.database as db
from dotenv import load_dotenv
from indic_transliteration import sanscript
from sqlalchemy.orm import Session

import ambuda.database as db

load_dotenv()
PROJECT_DIR = Path(__file__).parent.parent.parent
CACHE_DIR = PROJECT_DIR / "data" / "download-cache"
Expand Down Expand Up @@ -106,7 +105,8 @@ def write_kandas(
xml_id_prefix: str,
):
with Session(engine) as session:
text = db.Text(slug=text_slug, title=text_title, header=tei_header)
iti_genre = session.query(db.Genre).filter_by(name=db.TextGenre.ITIHASA.value).first()
text = db.Text(slug=text_slug, title=text_title, header=tei_header, genre=iti_genre)
session.add(text)
session.flush()

Expand Down
Loading