-
Notifications
You must be signed in to change notification settings - Fork 1
/
ct_collector.py
122 lines (114 loc) · 6.28 KB
/
ct_collector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import time
import pandas as pd
from urllib.parse import urlparse
import datetime
import requests
### Custom function adapted from https://pypi.org/project/PyCrowdTangle/
def ct_get_search_posts(count=100, start_date=None, end_date=None, include_history=None,
sort_by="date", types=None, search_term=None, account_types=None,
min_interactions=0, offset=0, api_token=None, platforms="facebook,instagram"):
"""Retrieve a set of posts for the given parameters get post from crowdtangle
Args:
count (int, optional): The number of posts to return. Defaults to 10. options [1-100]
start_date (str, optional): The earliest date at which a post could be posted. Time zone is UTC.
Format is “yyyy-mm-ddThh:mm:ss” or “yyyy-mm-dd”
(defaults to time 00:00:00).
end_date (str, optional): The latest date at which a post could be posted.
Time zone is UTC. Format is “yyyy-mm-ddThh:mm:ss”
or “yyyy-mm-dd” (defaults to time 00:00:00).
defaults to "now".
include_history (str, optional): Includes timestep data for growth of each post returned.
Defaults to null (not included). options: 'true'
sort_by (str, optional): The method by which to filter and order posts.
options: 'date', 'interaction_rate', 'overperforming',
'total_interactions', 'underperforming'.
defaults 'overperforming'
min_interactions (int, optional): If set, will exclude posts with total interactions
below this threshold. options int > 0, default 0
offset (int, optional): The number of posts to offset (generally used for pagination).
Pagination links will also be provided in the response.
types (str, optional): The types of post to include. These can be separated by commas
to include multiple types. If you want all live videos
(whether currently or formerly live), be sure to include both
live_video and live_video_complete. The "video" type does not
mean all videos, it refers to videos that aren't native_video,
youtube or vine (e.g. a video on Vimeo).
options: "episode", "extra_clip", "link", "live_video",
"live_video_complete", "live_video_scheduled", "native_video",
"photo", "status", "trailer","video", "vine", "youtube"
default all
search_term (str, optional): Returns only posts that match this search term.
Terms AND automatically. Separate with commas for OR,
use quotes for phrases. E.g. CrowdTangle API -> AND.
CrowdTangle, API -> OR. "CrowdTangle API" -> AND in that
exact order. You can also use traditional Boolean search
with this parameter. default null
api_token (str, optional): you can locate your API token via your crowdtangle dashboard
under Settings > API Access.
Returns:
[dict]: The Response contains both a status code and a result. The status will always
be 200 if there is no error. The result contains an array of post objects and
a pagination object with URLs for both the next and previous page, if they exist
Example:
ct_get_posts(include_history = 'true', api_token="AKJHXDFYTGEBKRJ6535")
"""
# api-endpoint
URL_BASE = "https://api.crowdtangle.com/posts/search"
# defining a params dict for the parameters to be sent to the API
PARAMS = {'count': count, 'sortBy': sort_by, 'token': api_token,
'minInteractions': min_interactions, 'offset': offset}
# add params parameters
if start_date:
PARAMS['startDate'] = start_date
if end_date:
PARAMS['endDate'] = end_date
if include_history == 'true':
PARAMS['includeHistory'] = include_history
if types:
PARAMS['types'] = types
if account_types:
PARAMS['accountTypes'] = account_types
if search_term:
PARAMS['searchTerm'] = search_term
if platforms:
PARAMS["platforms"] = platforms
# sending get request and saving the response as response object
r = requests.get(url=URL_BASE, params=PARAMS)
# if r.status_code != 200:
# out = r.json()
# print(f"status: {out['status']}")
# print(f"Code error: {out['code']}")
# print(f"Message: {out['message']}")
return r.json()
api_token = "************" ## put your Crowdtangle key here
track = ["vaccini", "vaccino", "vaccinazioni", "vaccinazione", "vaccinarsi",
"vaccinare", "vacciniamoci", "vaccinerò", "vaccinerete"]
track = ','.join(track)
start = datetime.datetime.today() - datetime.timedelta(days=7)
start = start.strftime("%Y-%m-%d")
end = datetime.datetime.today().strftime('%Y-%m-%d')
while 1:
try:
results = ct_get_search_posts(api_token=api_token, include_history="true",
count=10000, start_date=start, end_date=end,
search_term=track, sort_by="date")
results = results["result"]["posts"]
count = 0
for post in results:
### do something with it ###
############################
L = results.__len__()
last_date = results[L - 1]["date"]
print(str(last_date) + " - " + str(end) + ": " + str(count))
## get last post date to continue the search forward (you get a duplicated post, but that is the only way)
if L == 10000:
L = results.__len__()
last_date = results[L - 1]["date"]
end = last_date
continue
else:
break
except Exception as e: # 6 calls/minute limit
print(e)
time.sleep(60)
continue