Skip to content

Commit

Permalink
Porting mediacloud submodule to tabular records
Browse files Browse the repository at this point in the history
  • Loading branch information
Yomguithereal committed Dec 12, 2023
1 parent 82ca4cb commit 92be4a7
Show file tree
Hide file tree
Showing 10 changed files with 167 additions and 203 deletions.
11 changes: 3 additions & 8 deletions minet/cli/mediacloud/medias.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,11 @@
from minet.cli.utils import with_enricher_and_loading_bar
from minet.cli.mediacloud.utils import with_mediacloud_fatal_errors
from minet.mediacloud import MediacloudAPIClient
from minet.mediacloud.constants import (
MEDIACLOUD_MEDIA_CSV_HEADER,
MEDIACLOUD_FEED_CSV_HEADER,
)
from minet.mediacloud.types import MediacloudFeed, MediacloudMedia


def get_headers(cli_args):
headers = MEDIACLOUD_MEDIA_CSV_HEADER[1:]
headers = MediacloudMedia.fieldnames()[1:]

if cli_args.feeds is not None:
headers.append("feeds")
Expand All @@ -32,9 +29,7 @@ def action(cli_args, enricher, loading_bar):
feeds_writer = None

if cli_args.feeds:
feeds_writer = casanova.writer(
cli_args.feeds, fieldnames=MEDIACLOUD_FEED_CSV_HEADER
)
feeds_writer = casanova.writer(cli_args.feeds, fieldnames=MediacloudFeed)

client = MediacloudAPIClient(cli_args.token)

Expand Down
4 changes: 2 additions & 2 deletions minet/cli/mediacloud/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@
from minet.cli.loading_bar import LoadingBar
from minet.cli.mediacloud.utils import with_mediacloud_fatal_errors
from minet.mediacloud import MediacloudAPIClient
from minet.mediacloud.constants import MEDIACLOUD_STORIES_CSV_HEADER
from minet.mediacloud.types import MediacloudStory


@with_mediacloud_fatal_errors
def action(cli_args):
writer = casanova.writer(cli_args.output, fieldnames=MEDIACLOUD_STORIES_CSV_HEADER)
writer = casanova.writer(cli_args.output, fieldnames=MediacloudStory)

client = MediacloudAPIClient(cli_args.token)

Expand Down
6 changes: 2 additions & 4 deletions minet/cli/mediacloud/topic/stories.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,11 @@

from minet.cli.loading_bar import LoadingBar
from minet.mediacloud import MediacloudAPIClient
from minet.mediacloud.constants import MEDIACLOUD_TOPIC_STORIES_CSV_HEADERS
from minet.mediacloud.types import MediacloudTopicStory


def action(cli_args):
writer = casanova.writer(
cli_args.output, fieldnames=MEDIACLOUD_TOPIC_STORIES_CSV_HEADERS
)
writer = casanova.writer(cli_args.output, fieldnames=MediacloudTopicStory)

with LoadingBar(title="Fetching stories", unit="stories") as loading_bar:
client = MediacloudAPIClient(cli_args.token)
Expand Down
6 changes: 3 additions & 3 deletions minet/mediacloud/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from minet.mediacloud.utils import make_simple_call
from minet.mediacloud.search import mediacloud_search
from minet.mediacloud.topic import mediacloud_topic_stories
from minet.mediacloud.formatters import format_media, format_feed
from minet.mediacloud.types import MediacloudFeed, MediacloudMedia


class MediacloudAPIClient(object):
Expand Down Expand Up @@ -38,7 +38,7 @@ def media(self, media_id, **kwargs):
self.pool_manager,
self.token,
"/media/single",
format_media,
MediacloudMedia.from_payload,
arg=media_id,
single=True,
**kwargs
Expand All @@ -49,7 +49,7 @@ def feeds(self, media_id, **kwargs):
self.pool_manager,
self.token,
"/feeds/list",
format_feed,
MediacloudFeed.from_payload,
query={"media_id": media_id, "rows": 100},
**kwargs
)
69 changes: 0 additions & 69 deletions minet/mediacloud/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,75 +6,6 @@
#
from urllib3 import Timeout


MEDIACLOUD_API_BASE_URL = "https://api.mediacloud.org/api/v2"
MEDIACLOUD_DEFAULT_TIMEOUT = Timeout(connect=30, read=60 * 5)
MEDIACLOUD_DEFAULT_BATCH = 250

MEDIACLOUD_TOPIC_STORIES_CSV_HEADERS = [
"guid",
"stories_id",
"title",
"url",
"language",
"media_id",
"media_name",
"collect_date",
"publish_date",
"date_is_reliable",
"facebook_share_count",
"full_text_rss",
"inlink_count",
"outlink_count",
"media_inlink_count",
"post_count",
"snapshots_id",
"timespans_id",
"next_link_id",
]

MEDIACLOUD_STORIES_CSV_HEADER = [
"guid",
"stories_id",
"processed_stories_id",
"title",
"url",
"language",
"collect_date",
"publish_date",
"media_id",
"media_name",
"media_url",
"tags",
"tag_sets",
"tags_ids",
"tag_sets_ids",
]

MEDIACLOUD_MEDIA_CSV_HEADER = [
"media_id",
"media_name",
"media_url",
"is_healthy",
"is_monitored",
"public_notes",
"num_stories_90",
"num_sentences_90",
"start_date",
"tags",
"tag_sets",
"tags_ids",
"tag_sets_ids",
]

MEDIACLOUD_FEED_CSV_HEADER = [
"name",
"url",
"feeds_id",
"type",
"media_id",
"active",
"last_attempted_download_time",
"last_new_story_time",
"last_successful_download_time",
]
112 changes: 0 additions & 112 deletions minet/mediacloud/formatters.py

This file was deleted.

4 changes: 2 additions & 2 deletions minet/mediacloud/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from minet.web import request
from minet.mediacloud.constants import MEDIACLOUD_API_BASE_URL, MEDIACLOUD_DEFAULT_BATCH
from minet.mediacloud.exceptions import MediacloudServerError
from minet.mediacloud.formatters import format_story
from minet.mediacloud.types import MediacloudStory
from minet.mediacloud.utils import get_last_processed_stories_id


Expand Down Expand Up @@ -145,7 +145,7 @@ def generator():

for story in data:
if not raw:
story = format_story(story)
story = MediacloudStory.from_payload(story)

yield story

Expand Down
4 changes: 2 additions & 2 deletions minet/mediacloud/topic.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from minet.web import request
from minet.mediacloud.constants import MEDIACLOUD_API_BASE_URL, MEDIACLOUD_DEFAULT_BATCH
from minet.mediacloud.utils import get_next_link_id
from minet.mediacloud.formatters import format_topic_story
from minet.mediacloud.types import MediacloudTopicStory


def url_forge(
Expand Down Expand Up @@ -60,7 +60,7 @@ def mediacloud_topic_stories(

for story in data["stories"]:
if not raw:
story = format_topic_story(story, next_link_id)
story = MediacloudTopicStory.from_payload(story, next_link_id)

yield story

Expand Down
Loading

0 comments on commit 92be4a7

Please sign in to comment.