-
Notifications
You must be signed in to change notification settings - Fork 0
/
weibo_hot_list_serach.py
68 lines (58 loc) · 2.17 KB
/
weibo_hot_list_serach.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import requests
import json
import datetime
import pytz
from EuclidDataTools import CsvClient
import pandas as pd
from pathlib import Path
# Fetch the hot-search payload.
Url = "https://weibo.com/ajax/side/hotSearch"
# NOTE(review): `headers` is never passed to requests.get() below, so the request
# goes out with the library defaults. The "Host" value points at httpbin.org, not
# weibo.com, so it is deliberately NOT wired in here - confirm what was intended.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64)", "Host": "httpbin.org"}
response = requests.get(Url, timeout=60)
# Fail fast on an HTTP error status instead of discovering it later as a
# JSON-decoding or missing-key error while parsing an error page.
response.raise_for_status()
# Decode the body as UTF-8, silently dropping bytes that are not valid UTF-8.
data = response.content.decode("utf-8", "ignore")

# Record the capture time in the Asia/Shanghai timezone.
tz = pytz.timezone("Asia/Shanghai")
now_time = datetime.datetime.now(tz)
print(now_time)
year = now_time.year
month = now_time.month
day = now_time.day
hour = now_time.hour
# NOTE: shadows the builtin min(); the name is kept because later module-level
# code reads it when building the per-record timestamp.
min = now_time.minute
sec = now_time.second

# Storage target: a CSV in the "hotlist" sub-folder, named after today's date.
myCol = CsvClient(subFolder="hotlist", FileName=f"{year}_{month}_{day}.csv")
def _deal_sigle_hot_data(raw_data: dict) -> dict:
"""
处理单条热搜数据
:param raw_data: 从网页获取的原始数据
:return: 处理后的数据
"""
data = {}
try:
data["is_ad"] = raw_data["is_ad"]
return {}
except KeyError:
pass
data["word"] = raw_data["word"]
data["hot"] = raw_data["raw_hot"]
data["mid"] = raw_data["mid"]
return data
# Store every kept entry of the "realtime" list, stamped with the capture time.
payload = json.loads(data)
for raw_entry in payload["data"]["realtime"]:
    record = _deal_sigle_hot_data(raw_entry)
    if not record:
        # An empty dict means the entry was filtered out; nothing to store.
        continue
    record["time"] = f"{year}-{month}-{day}:{hour}:{min}:{sec}"
    myCol.insert_one(record)
# Convert today's data into the per-hour ranking files.
# This exists purely to match the format expected on the JS side; the JS code is
# not visible from here, so docs/example.csv is reused as the column template.
example = pd.read_csv(Path("docs/example.csv"), encoding="gbk", low_memory=False)
data = pd.read_csv(myCol.FullFilePath, encoding="utf_8_sig")
data["time"] = pd.to_datetime(data["time"], format="%Y-%m-%d:%H:%M:%S")
for i in range(0, 24):
    # Keep only the rows captured strictly after i:00:00 today.
    data_later_hour = data[data["time"] > pd.to_datetime(f"{year}-{month}-{day}:{i}:00:00", format="%Y-%m-%d:%H:%M:%S")]
    # Column assignment aligns on the row index: only index labels already
    # present in `example` receive a value, every other label becomes NaN.
    # NOTE(review): rows of `data` whose index label is absent from `example`
    # are therefore dropped - presumably example.csv has enough rows; confirm.
    example["name"] = data_later_hour["word"]
    example["value"] = data_later_hour["hot"]
    example["date"] = data_later_hour["time"].dt.strftime("%Y-%m-%d %H:%M")
    # Drop template rows that found no match. `example` is carried over to the
    # next hour on purpose; the explicit copy makes it an independent frame so
    # the assignments above do not write into a slice (SettingWithCopyWarning).
    example = example[~example["name"].isna()].copy()
    # Save the data; create docs/<hour>/ first so to_csv cannot fail on a
    # missing directory.
    ranking_path = Path(f"docs/{i}/ranking_data.csv")
    ranking_path.parent.mkdir(parents=True, exist_ok=True)
    example.to_csv(ranking_path, index=False, encoding="utf-8-sig")