rssScrapeMD3.py
#! /usr/bin/python3
# Import libraries
import datetime
import os
import time
import urllib.request

import feedparser
import lxml.html
from jinja2 import Template
from rich import print  # rich's print replaces the builtin to support colour markup
from rich.console import Console
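# Third-party dependencies are assumed to be installed already,
# e.g. via: pip install feedparser lxml rich jinja2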
dateGenerated = datetime.datetime.now().strftime("%d/%m/%Y")
# Time the processing
startTime = time.time()
# Clear the screen whenever the script is run
console = Console()
console.clear()
# Variables to store RSS feed URI and path to mkdocs folder
feedLink = "https://latenightlinux.com/feed/mp3"
basePath = '.'
showSlug = 'LateNightLinuxMkDocsV3/docs'
confFilePath = 'LateNightLinuxMkDocsV3/mkdocs.yml'
buildCmd = './buildSite.sh'
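# buildSite.sh is assumed to run the MkDocs build (e.g. mkdocs build against
# the config at confFilePath); it is invoked at the end of this script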
# Open template files
with open("templates/indexTemplate.md.j2") as f:
    indexTemplate = Template(f.read())
with open("templates/discoveriesTemplate.md.j2") as g:
    discoveriesTemplate = Template(g.read())
with open("templates/rssLinkTemplate.md.j2") as h:
    rssLinkTemplate = Template(h.read())
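# The templates are assumed to take: dateGenerated (indexTemplate), an episode
# dict (discoveriesTemplate), and an episode plus a single discovery
# (rssLinkTemplate), matching the render() calls further down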
# List all currently generated MD files to determine which episodes still need processing
def listMdFiles():
    mdFiles = []
    dirList = os.listdir(os.path.join(basePath, showSlug))
    for dirObject in dirList:
        if os.path.isdir(os.path.join(basePath, showSlug, dirObject)):
            fileList = os.listdir(os.path.join(basePath, showSlug, dirObject))
            for file in fileList:
                filePath = os.path.join(basePath, showSlug, dirObject, file)
                if os.path.isfile(filePath) and os.path.splitext(filePath)[1] == ".md":
                    mdFiles.append(os.path.splitext(file)[0])
    return mdFiles
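# Episode pages are written as <showSlug>/<year>/<episode title>.md below, so the
# basenames collected here can be compared directly against feed entry titles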
# From the page's HTML, generate one string for the title and one for the meta description contents
def readMetaAndTitle(uri):
    print("URI: ", uri)
    data = ""
    # Load the HTML from the given URI, spoofing a browser User-Agent as some sites reject the default Python one
    try:
        req = urllib.request.Request(uri, data=None, headers={
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})
        data = urllib.request.urlopen(req)
        data = data.read().decode("utf-8")
    except Exception as error:
        print(f"[red]\t\t\tError opening: {error} - {uri}")
        return {"title": "", "description": ""}
    # Parse the HTML using lxml
    try:
        pageHtml = lxml.html.fromstring(data)
    except ValueError:
        # Fetch again and parse the raw bytes if parsing the decoded string fails
        # (lxml rejects str input that carries its own encoding declaration)
        data = urllib.request.urlopen(req).read()
        pageHtml = lxml.html.fromstring(data)
    # Collect the title text and format it into a single string
    titles = pageHtml.xpath("//head//title")
    titleString = ""
    for title in titles:
        if isinstance(title.text, str):
            titleString += title.text.strip().replace("\n", " - ")
    # Collect the meta tags with a name attribute of description
    metaDescriptions = pageHtml.xpath("//meta[@name = 'description']")
    metaDescriptionString = ""
    for metaDescription in metaDescriptions:
        content = metaDescription.attrib.get("content", "")
        if isinstance(content, str) and content != "":
            # Replace the two-character line endings first so they don't become double separators
            metaDescriptionString += content.replace("\r\n", " - ").replace(
                "\n\r", " - ").replace("\n", " - ").replace("\r", " - ")
    return {"title": titleString, "description": metaDescriptionString}
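# Illustrative example: readMetaAndTitle("https://example.com") should return
# something like {"title": "Example Domain", "description": ""}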
def processDiscoveries(paragraph):
    # Build a list of discovery links from a single paragraph element
    # (currently unused; the main loop below does this inline)
    discoLinkList = []
    links = paragraph.getchildren()
    for child in links:
        if child.tag == "a":
            discoveryText = child.text
            discoveryLink = child.attrib["href"]
            # Retry up to three times, as lookups can fail intermittently
            discoveryDetails = readMetaAndTitle(discoveryLink)
            if discoveryDetails["title"] == "" and discoveryDetails["description"] == "":
                discoveryDetails = readMetaAndTitle(discoveryLink)
            if discoveryDetails["title"] == "" and discoveryDetails["description"] == "":
                discoveryDetails = readMetaAndTitle(discoveryLink)
            discoLink = {"text": discoveryText, "link": discoveryLink,
                         "linkTitle": discoveryDetails["title"], "linkMetaDescription": discoveryDetails["description"]}
            discoLinkList.append(discoLink)
    return discoLinkList
# Load the RSS feed and create a list to store episode details
feed = feedparser.parse(feedLink)
episodes = []
print("[yellow]Calculating already processed episodes...")
processedEpisodes = listMdFiles()
# Write the index file, including the generation date
print("[yellow]Writing index file...")
output = indexTemplate.render(
    {
        "dateGenerated": dateGenerated,
    }
)
with open(os.path.join(basePath, showSlug, 'index.md'), "w") as f:
    f.write(output)
# Iterate through each episode, work out which ones have discoveries,
# and collect the discovery details into a list
print("[yellow]Iterating through episodes...")
for episode in feed.entries:
    discoLinkList = []
    episodeName = episode.title
    episodeLink = episode.link
    print(f"[blue]\t{episode.title}")
    # Skip the episode if an MD file already exists for it
    if episodeName in processedEpisodes:
        print("[green]\t\tAlready processed. Ignoring")
    else:
        # Process the episode because no MD file exists for it yet
        episodePublished = datetime.datetime.strptime(
            episode.published, "%a, %d %b %Y %H:%M:%S +0000")
        episodePublishedString = episodePublished.strftime("%d/%m/%Y")
        episodePublishedTimeString = episodePublished.strftime("%H:%M:%S")
        # Find the paragraph in the encoded content that references <strong>Discoveries,
        # and the next paragraph that also starts with a <strong> tag
        pageHtml = lxml.html.fromstring(episode.content[0].value)
        paragraphs = pageHtml.xpath("//p")
        lowCount = -1
        highCount = -1
        counter = 0
        print(f"[green]\t\tFinding discoveries")
        for paragraph in paragraphs:
            if len(paragraph) > 0:
                firstChild = paragraph.getchildren()[0]
                if firstChild.tag == "strong" or firstChild.tag == "b":
                    if isinstance(firstChild.text, str) and ('Discoveries' in firstChild.text or 'Discovery' in firstChild.text):
                        lowCount = counter
                        # discoLinkList = processDiscoveries(paragraph)
                    elif lowCount > -1:
                        highCount = counter
                        break
            counter += 1
        # Now work through the discoveries, using the bounds from the previous loop
        print(f"[green]\t\tWorking out details from the link")
        for i in range(lowCount, highCount):
            for child in paragraphs[i].getchildren():
                if child.tag == "a":
                    discoveryText = child.text
                    discoveryLink = child.attrib["href"]
                    if discoveryText != "contact page":
                        # Retry up to three times, as lookups can fail intermittently
                        discoveryDetails = readMetaAndTitle(discoveryLink)
                        if discoveryDetails["title"] == "" and discoveryDetails["description"] == "":
                            discoveryDetails = readMetaAndTitle(discoveryLink)
                        if discoveryDetails["title"] == "" and discoveryDetails["description"] == "":
                            discoveryDetails = readMetaAndTitle(discoveryLink)
                        discoLink = {"text": discoveryText, "link": discoveryLink,
                                     "linkTitle": discoveryDetails["title"], "linkMetaDescription": discoveryDetails["description"]}
                        discoLinkList.append(discoLink)
        # Only record episodes that actually contain discoveries
        if len(discoLinkList) > 0:
            episodes.append({'episodeName': episodeName, 'episodeLink': episodeLink, 'episodePublished': episodePublished,
                             'episodePublishedString': episodePublishedString, 'episodePublishedTimeString': episodePublishedTimeString, 'discoLinkList': discoLinkList})
# Now write the files into a directory structure, detailing the links inside
# Create the base directory if it doesn't exist
if not os.path.isdir(os.path.join(basePath, showSlug)):
    os.mkdir(os.path.join(basePath, showSlug))
# Create the rss directory if it doesn't exist
if not os.path.isdir(os.path.join(basePath, showSlug, 'rss')):
    os.mkdir(os.path.join(basePath, showSlug, 'rss'))
# Create the rss directory's .pages file if it doesn't exist
if not os.path.isfile(os.path.join(basePath, showSlug, 'rss', '.pages')):
    with open(os.path.join(basePath, showSlug, 'rss', '.pages'), 'w') as f:
        f.write("hide: true")
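# "hide: true" in a .pages file is read by the mkdocs-awesome-pages plugin
# (assuming that plugin is enabled in mkdocs.yml) and keeps the rss directory
# out of the generated navigation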
print("[yellow]Writing MD files and directories")
for episode in episodes:
    # Create a folder for each year within the feed
    if not os.path.isdir(os.path.join(basePath, showSlug, str(episode['episodePublished'].year))):
        os.mkdir(os.path.join(basePath, showSlug,
                              str(episode['episodePublished'].year)))
    # Create a file for each episode
    output = discoveriesTemplate.render(
        {
            "episode": episode,
        }
    )
    with open(os.path.join(basePath, showSlug, str(
            episode['episodePublished'].year), episode['episodeName'] + '.md'), "w") as f:
        f.write(output)
    print('[red]\tWritten file for...', episode['episodeName'])
# Now generate the RSS link pages
print('[yellow]Writing files for RSS feed')
for episode in episodes:
    for discovery in episode["discoLinkList"]:
        output = rssLinkTemplate.render(
            {
                "episode": episode,
                "discovery": discovery
            }
        )
        # Strip characters that are awkward in file names ("/", "'", "&")
        safeText = discovery["text"].replace("/", "").replace("'", "").replace("&", "")
        with open(os.path.join(basePath, showSlug, 'rss', f'{episode["episodeName"]} - {safeText}.md'), "w") as f:
            f.write(output)
        print('[red]\tWritten file for...', episode['episodeName'], '-', safeText + '.md')
print('[yellow]Generating site...')
os.system(buildCmd)
endTime = time.time()
print(f"Time taken to run: {round(endTime - startTime)}s")