diff --git a/Makefile b/Makefile index b13c21fd..6da22b57 100644 --- a/Makefile +++ b/Makefile @@ -26,6 +26,7 @@ db_seed_all: python -m ambuda.seed.lookup.role python -m ambuda.seed.lookup.page_status python -m ambuda.seed.texts.gretil + python -m ambuda.seed.texts.sarit python -m ambuda.seed.texts.ramayana python -m ambuda.seed.texts.mahabharata python -m ambuda.seed.dcs diff --git a/ambuda/consts.py b/ambuda/consts.py index 9122bf34..572725fe 100644 --- a/ambuda/consts.py +++ b/ambuda/consts.py @@ -18,6 +18,7 @@ "saundaranandam", "hamsadutam", ], + "purana": ["skandapuranam"], "anye": [ "bodhicaryavatara", ], diff --git a/ambuda/seed/texts/gretil.py b/ambuda/seed/texts/gretil.py index 3d66a950..f57454fe 100644 --- a/ambuda/seed/texts/gretil.py +++ b/ambuda/seed/texts/gretil.py @@ -23,6 +23,7 @@ class Spec: REPO = "https://github.com/ambuda-org/gretil.git" +BRANCH = "main" PROJECT_DIR = Path(__file__).resolve().parents[3] DATA_DIR = PROJECT_DIR / "data" / "ambuda-gretil" #: Slug to use for texts that have only one section. @@ -57,15 +58,15 @@ class Spec: } -def fetch_latest_data(): +def fetch_latest_data(repo: str, branch: str, data_dir: str): """Fetch the latest data from our GitHub repo.""" - if not DATA_DIR.exists(): - subprocess.run(f"mkdir -p {DATA_DIR}", shell=True) - subprocess.run(f"git clone --branch=main {REPO} {DATA_DIR}", shell=True) + if not data_dir.exists(): + subprocess.run(f"mkdir -p {data_dir}", shell=True) + subprocess.run(f"git clone --branch={branch} {repo} {data_dir}", shell=True) - subprocess.call("git fetch origin", shell=True, cwd=DATA_DIR) - subprocess.call("git checkout main", shell=True, cwd=DATA_DIR) - subprocess.call("git reset --hard origin/main", shell=True, cwd=DATA_DIR) + subprocess.call("git fetch origin", shell=True, cwd=data_dir) + subprocess.call(f"git checkout {branch}", shell=True, cwd=data_dir) + subprocess.call(f"git reset --hard origin/{branch}", shell=True, cwd=data_dir) @dataclass @@ -191,9 +192,9 @@ def parse_tei_document(xml: ET.Element) -> Document: return Document(header=header_blob, sections=sections) -def add_document(engine, spec: Spec): +def add_document(engine, data_dir: str, spec: Spec): log(f"Writing text: {spec.slug}") - document_path = DATA_DIR / "1_sanskr" / "tei" / spec.filename + document_path = data_dir / spec.filename delete_existing_text(engine, spec.slug) with Session(engine) as session: @@ -227,13 +228,13 @@ def add_document(engine, spec: Spec): def run(): log("Downloading the latest data ...") - fetch_latest_data() + fetch_latest_data(REPO, BRANCH, DATA_DIR) log("Initializing database ...") engine = create_db() for spec in ALLOW: - add_document(engine, spec) + add_document(engine, DATA_DIR / "1_sanskr" / "tei", spec) log("Done.") diff --git a/ambuda/seed/texts/sarit.py b/ambuda/seed/texts/sarit.py new file mode 100644 index 00000000..d42598eb --- /dev/null +++ b/ambuda/seed/texts/sarit.py @@ -0,0 +1,28 @@ +"""Parse Sanskrit texts from SARIT. + +""" +from ambuda.seed.texts.gretil import * + + +REPO = "https://github.com/sarit/SARIT-corpus.git" +BRANCH = "master" +PROJECT_DIR = Path(__file__).resolve().parents[3] +DATA_DIR = PROJECT_DIR / "data" / "ambuda-sarit" + +ALLOW = [Spec("skandapuranam", "skandapurANam", "skandapurana.xml")] + + +def run(): + log("Downloading the latest data ...") + fetch_latest_data(REPO, BRANCH, DATA_DIR) + + log("Initializing database ...") + engine = create_db() + + for spec in ALLOW: + add_document(engine, DATA_DIR, spec) + log("Done.") + + +if __name__ == "__main__": + run() diff --git a/ambuda/templates/texts/index.html b/ambuda/templates/texts/index.html index 383fc399..348197d4 100644 --- a/ambuda/templates/texts/index.html +++ b/ambuda/templates/texts/index.html @@ -35,6 +35,7 @@