Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: upgrade to python 3.13 #322

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
4 changes: 2 additions & 2 deletions .github/workflows/checks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ jobs:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: "3.12.6"
python-version: "3.13.0"
cache: "pip"
- uses: pre-commit/[email protected]
unittests:
Expand All @@ -29,7 +29,7 @@ jobs:
cache-dependency-glob: |
**/uv.lock
**/pyproject.toml
- run: uv python install 3.12.6
- run: uv python install 3.13.0
- run: uv sync --frozen --all-extras --no-install-project
- run: uv run pyright
- run: uv run xargs -I{} python -c "import nltk; nltk.download('{}')" < ./nltk.txt
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Install Python dependencies
FROM python:3.12.6-bookworm AS python-base
FROM python:3.13.0-bookworm AS python-base

ENV LC_CTYPE=C.utf8 \
PYTHONUNBUFFERED=1 \
Expand Down
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ Radiofeed requires the following basic dependencies to get started:
* Python 3.13
* [uv](https://docs.astral.sh)

**Note:** if you don't have the right version of Python you can use `uv python install 3.12.x`.
**Note:** if you don't have the right version of Python you can use `uv python install 3.13.x`.

For ease of local development a `docker-compose.yml` file is provided which includes Docker images:

Expand All @@ -28,7 +28,9 @@ The [justfile](https://github.com/casey/just) has some convenient shortcuts for
* `just update`: update dependencies to latest available versions
* `just clean`: remove all non-committed files and other artifacts
* `just serve`: run the development server and Tailwind JIT compiler
* `just test`: run the test suite
* `just shell`: open a shell in the development environment
* `just test`: run unit tests
* `just check`: run unit tests and linters

The install command will also create a `.env` file with default settings for local development, if one does not already exist.

Expand Down
9 changes: 6 additions & 3 deletions justfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,15 @@ check:
serve:
./manage.py tailwind runserver_plus

shell:
./manage.py shell_plus

clean:
git clean -Xdf

test *args:
pytest {{ args }}

precommitall:
pre-commit run -a

typecheck:
pyright

Expand All @@ -48,5 +48,8 @@ precommmitinstall:
precommitupdate:
pre-commit autoupdate

precommitall:
pre-commit run -a

nltkdownload:
uv run xargs -I{} python -c "import nltk; nltk.download('{}')" < ./nltk.txt
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ description = "Podcast aggregator app"
authors = [
{name = "Dan Jacob", email = "[email protected]"},
]
requires-python = "==3.12.*"
requires-python = "==3.13.*"
readme = "README.md"
license = {text = "MIT"}
dependencies = [
Expand Down Expand Up @@ -93,6 +93,7 @@ testpaths = ["radiofeed", "templates"]
env = [
"COVERAGE_CORE=sysmon",
"SECURE_SSL_REDIRECT=false",
"USE_CONNECTION_POOL=false",
"USE_COLLECTSTATIC=false",
"USE_X_FORWARDED_HOST=false",
]
Expand Down
149 changes: 76 additions & 73 deletions radiofeed/feedparser/feed_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,55 +88,39 @@ def parse(self, client: Client) -> None:
response: httpx.Response | None = None
try:
response = self._get_response(client)

content_hash = self._make_content_hash(response)
self._check_duplicates(response, content_hash)
self._handle_update(

self._parse_ok(
response=response,
content_hash=content_hash,
feed=rss_parser.parse_rss(response.content),
)
except FeedParserError as exc:
self._handle_error(exc, response or exc.response)

def _make_content_hash(self, response: httpx.Response) -> str:
content_hash = make_content_hash(response.content)

# check content hash has changed
if content_hash == self._podcast.content_hash:
raise NotModifiedError

return content_hash

def _check_duplicates(self, response: httpx.Response, content_hash: str) -> None:
# check no other podcast with this RSS URL or identical content
if (
Podcast.objects.exclude(pk=self._podcast.pk)
.filter(Q(rss=response.url) | Q(content_hash=content_hash))
.exists()
):
raise DuplicateError
self._parse_error(exc, response or exc.response)

def _handle_update(
def _parse_ok(
self,
*,
response: httpx.Response,
content_hash: str,
feed: Feed,
) -> None:
categories, keywords = self._parse_taxonomy(feed)
categories_dct = get_categories()

try:
with transaction.atomic():
self._podcast_update(
num_retries=0,
parser_error="",
content_hash=content_hash,
keywords=keywords,
rss=response.url,
active=not (feed.complete),
etag=self._parse_etag(response),
modified=self._parse_modified(response),
extracted_text=self._extract_text(feed),
keywords=self._parse_keywords(feed, categories_dct),
extracted_text=self._tokenize_content(feed),
frequency=scheduler.schedule(feed),
**feed.model_dump(
exclude={
Expand All @@ -147,41 +131,20 @@ def _handle_update(
),
)

self._podcast.categories.set(categories)
self._podcast.categories.set(
self._parse_categories(feed, categories_dct)
)

self._episode_updates(feed)

self._logger.success("Feed updated")
except DataError as exc:
raise InvalidDataError from exc

def _get_response(self, client: Client) -> httpx.Response:
try:
try:
return client.get(self._podcast.rss, headers=self._get_headers())
except httpx.HTTPStatusError as exc:
if exc.response.is_redirect:
raise NotModifiedError(response=exc.response) from exc
if exc.response.is_client_error:
raise InaccessibleError(response=exc.response) from exc
raise
except httpx.HTTPError as exc:
raise UnavailableError from exc

def _parse_etag(self, response: httpx.Response) -> str:
return response.headers.get("ETag", "")

def _parse_modified(self, response: httpx.Response) -> datetime | None:
return parse_date(response.headers.get("Last-Modified"))

def _get_headers(self) -> dict[str, str]:
headers = {"Accept": self._accept_header}
if self._podcast.etag:
headers["If-None-Match"] = quote_etag(self._podcast.etag)
if self._podcast.modified:
headers["If-Modified-Since"] = http_date(self._podcast.modified.timestamp())
return headers

def _handle_error(
self, exc: FeedParserError, response: httpx.Response | None = None
def _parse_error(
self,
exc: FeedParserError,
response: httpx.Response | None = None,
) -> None:
active: bool = True
num_retries: int = self._podcast.num_retries
Expand Down Expand Up @@ -229,6 +192,51 @@ def _handle_error(
# re-raise original exception
raise exc

def _get_response(self, client: Client) -> httpx.Response:
    """Fetch the podcast RSS feed, mapping HTTP failures to parser errors.

    Raises:
        NotModifiedError: the server answered with a redirect status.
        InaccessibleError: the server answered with a 4xx client error.
        UnavailableError: any other transport or server-side HTTP failure.
    """
    try:
        return client.get(self._podcast.rss, headers=self._get_headers())
    except httpx.HTTPStatusError as exc:
        failed = exc.response
        if failed.is_redirect:
            raise NotModifiedError(response=failed) from exc
        if failed.is_client_error:
            raise InaccessibleError(response=failed) from exc
        # Server errors fall through to the generic "unavailable" case.
        raise UnavailableError from exc
    except httpx.HTTPError as exc:
        raise UnavailableError from exc

def _make_content_hash(self, response: httpx.Response) -> str:
    """Hash the response body and return it if it differs from the stored hash.

    Raises:
        NotModifiedError: the body hash matches the podcast's stored hash,
            meaning the feed content is unchanged and parsing can stop early.
    """
    new_hash = make_content_hash(response.content)
    if new_hash != self._podcast.content_hash:
        return new_hash
    raise NotModifiedError

def _check_duplicates(self, response: httpx.Response, content_hash: str) -> None:
    """Guard against another podcast sharing this RSS URL or identical content.

    Raises:
        DuplicateError: some other podcast row already has this final
            response URL or the same content hash.
    """
    others = Podcast.objects.exclude(pk=self._podcast.pk)
    is_duplicate = others.filter(
        Q(rss=response.url) | Q(content_hash=content_hash)
    ).exists()
    if is_duplicate:
        raise DuplicateError

def _parse_etag(self, response: httpx.Response) -> str:
return response.headers.get("ETag", "")

def _parse_modified(self, response: httpx.Response) -> datetime | None:
    """Parse the Last-Modified header into a datetime, or None when missing/invalid."""
    last_modified = response.headers.get("Last-Modified")
    return parse_date(last_modified)

def _get_headers(self) -> dict[str, str]:
    """Build request headers, adding conditional-fetch headers when available.

    If the podcast has a stored ETag and/or Last-Modified timestamp, send
    If-None-Match / If-Modified-Since so the server can reply 304.
    """
    headers: dict[str, str] = {"Accept": self._accept_header}
    etag = self._podcast.etag
    if etag:
        headers["If-None-Match"] = quote_etag(etag)
    modified = self._podcast.modified
    if modified:
        headers["If-Modified-Since"] = http_date(modified.timestamp())
    return headers

def _podcast_update(self, **fields) -> None:
now = timezone.now()

Expand All @@ -238,34 +246,29 @@ def _podcast_update(self, **fields) -> None:
**fields,
)

def _parse_taxonomy(self, feed: Feed) -> tuple[list[Category], str]:
categories: list[Category] = []
keywords: str = ""

if category_names := {c.casefold() for c in feed.categories}:
categories_dct = get_categories()

categories = [
categories_dct[name]
for name in category_names
if name in categories_dct
]

keywords = " ".join(
[name for name in category_names if name not in categories_dct]
)
def _parse_keywords(self, feed: Feed, categories_dct: dict[str, Category]) -> str:
return " ".join(
[value for value in feed.categories if value not in categories_dct]
)

return categories, keywords
def _parse_categories(
self, feed: Feed, categories_dct: dict[str, Category]
) -> list[Category]:
return [
categories_dct[value]
for value in feed.categories
if value in categories_dct
]

def _extract_text(self, feed: Feed) -> str:
def _tokenize_content(self, feed: Feed) -> str:
text = " ".join(
value
for value in [
feed.title,
feed.description,
feed.owner,
]
+ feed.categories
+ list(feed.categories)
+ [item.title for item in feed.items][:6]
if value
)
Expand Down
16 changes: 14 additions & 2 deletions radiofeed/feedparser/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ class Item(BaseModel):
guid: str = Field(..., min_length=1)
title: str = Field(..., min_length=1)

categories: list[str] = Field(default_factory=list)
categories: set[str] = Field(default_factory=set)

description: EmptyIfNone = ""
keywords: EmptyIfNone = ""
Expand Down Expand Up @@ -158,6 +158,12 @@ class Item(BaseModel):
),
] = DEFAULT_EPISODE_TYPE

@field_validator("categories", mode="after")
@classmethod
def validate_categories(cls, value: Any) -> set[str]:
    """Casefold category names, dropping empty values.

    The set comprehension already de-duplicates, so the original
    intermediate ``set(filter(None, value))`` was redundant; a simple
    truthiness guard in the comprehension is equivalent.
    """
    return {c.casefold() for c in value if c}

@field_validator("pub_date", mode="before")
@classmethod
def validate_pub_date(cls, value: Any) -> datetime:
Expand Down Expand Up @@ -247,7 +253,7 @@ class Feed(BaseModel):

items: list[Item]

categories: list[str] = Field(default_factory=list)
categories: set[str] = Field(default_factory=set)

@field_validator("language", mode="before")
@classmethod
Expand All @@ -257,6 +263,12 @@ def validate_language(cls, value: Any) -> str:
value.casefold()[:2] if value and len(value) > 1 else cls.DEFAULT_LANGUAGE
)

@field_validator("categories", mode="after")
@classmethod
def validate_categories(cls, value: Any) -> set[str]:
    """Casefold category names, dropping empty values.

    The set comprehension already de-duplicates, so the original
    intermediate ``set(filter(None, value))`` was redundant; a simple
    truthiness guard in the comprehension is equivalent.
    """
    return {c.casefold() for c in value if c}

@field_validator("complete", mode="before")
@classmethod
def validate_complete(cls, value: Any) -> bool:
Expand Down
7 changes: 4 additions & 3 deletions radiofeed/feedparser/tests/test_feed_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,9 +158,10 @@ def test_parse_ok(self, categories):
assert podcast.description == "Blog and Podcast specializing in offbeat news"
assert podcast.owner == "8th Kind"

assert (
podcast.extracted_text
== "mysterious universe blog specializing offbeat th kind science medicine science social science religion spirituality spirituality society culture philosophy mu tibetan zombie mu saber tooth tiger king mu kgb cop mu joshua cutchin timothy renner mu squid router mu jim bruton"
tokens = set(podcast.extracted_text.split())

assert tokens == set(
"mysterious universe blog specializing offbeat th kind science spirituality science medicine society culture philosophy social science religion spirituality mu tibetan zombie mu saber tooth tiger king mu kgb cop mu joshua cutchin timothy renner mu squid router mu jim bruton".split()
)

assert podcast.modified
Expand Down
6 changes: 3 additions & 3 deletions radiofeed/feedparser/tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,13 @@ def test_length_valid(self):

def test_default_keywords_from_categories(self):
item = Item(**ItemFactory(categories=["Gaming", "Hobbies", "Video Games"]))
assert item.keywords == "Gaming Hobbies Video Games"
assert set(item.keywords.split()) == {"gaming", "hobbies", "video", "games"}

def test_defaults(self):
item = Item(**ItemFactory())
assert item.explicit is False
assert item.episode_type == "full"
assert item.categories == []
assert item.categories == set()
assert item.keywords == ""

@pytest.mark.parametrize(
Expand Down Expand Up @@ -102,5 +102,5 @@ def test_defaults(self, item):
assert feed.explicit is False
assert feed.language == "en"
assert feed.description == ""
assert feed.categories == []
assert feed.categories == set()
assert feed.pub_date == item.pub_date
Loading