-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrssScrapeMD2.py
executable file
·246 lines (217 loc) · 10.2 KB
/
rssScrapeMD2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
#! /usr/bin/python3
# Import libraries
import datetime
import os
import feedparser
import lxml.html
import urllib
from rich import print
from rich.console import Console
import time
import ssl
# Time the processing: wall-clock start, reported at the end of the script
startTime = time.time()
# Clear the screen whenever the script is run
console = Console()
console.clear()
# Variables to store RSS feed URI and path to mkdocs folder
feedLink = "https://latenightlinux.com/feed/mp3"   # RSS feed to scrape
basePath = '.'                                      # root all other paths are joined onto
showSlug = 'LateNightLinuxMkDocsV2/docs'            # MkDocs docs directory (MD output goes here)
confFilePath = 'LateNightLinuxMkDocsV2/mkdocs.yml'  # MkDocs config whose version line gets bumped
buildCmd = './buildSite.sh'                         # shell script that builds the static site
# List all currently generated MD files to determine if all episodes need to be processed
def listMdFiles():
    """Return the stem (extension-less) names of every .md file found one
    directory level below the docs root — i.e. the episodes already processed."""
    found = []
    rootDir = os.path.join(basePath, showSlug)
    for entry in os.listdir(rootDir):
        entryPath = os.path.join(rootDir, entry)
        # Only year sub-directories are scanned; loose files at the root are skipped
        if not os.path.isdir(entryPath):
            continue
        for candidate in os.listdir(entryPath):
            candidatePath = os.path.join(entryPath, candidate)
            if os.path.isfile(candidatePath) and os.path.splitext(candidatePath)[1] == ".md":
                found.append(os.path.splitext(candidate)[0])
    return found
# Generate, from the site's HTML a string to represent the title and one to represent the meta description contents
# Generate, from the site's HTML, a string for the title and one for the meta description
def readMetaAndTitle(uri):
    """Fetch *uri* and return {"title": ..., "description": ...} scraped from its HTML.

    On any network/SSL failure the error is printed and an empty pair is
    returned, so callers can retry without handling exceptions themselves.
    """
    # Load the HTML from the defined uri; a browser User-Agent avoids naive bot blocking
    try:
        req = urllib.request.Request(uri, data=None, headers={
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})
        # `with` closes the response even if read/decode fails (the original leaked it)
        with urllib.request.urlopen(req) as response:
            data = response.read().decode("utf-8")
    # HTTPError is a subclass of URLError, so one clause replaces the three
    # duplicated handlers the original carried
    except (urllib.error.URLError, ssl.SSLError) as error:
        print(f"[red]\t\t\tError opening: {error} - {uri}")
        return {"title": "", "description": ""}
    # Parse the HTML using the lxml libraries
    pageHtml = lxml.html.fromstring(data)
    # Concatenate all <head><title> texts, flattening embedded newlines
    titleString = ""
    for title in pageHtml.xpath("//head//title"):
        if isinstance(title.text, str):
            titleString += title.text.strip().replace("\n", " - ")
    # Concatenate all non-empty <meta name="description"> contents
    metaDescriptionString = ""
    for metaDescription in pageHtml.xpath("//meta[@name = 'description']"):
        content = metaDescription.attrib.get("content", "")
        if isinstance(content, str) and content != "":
            # BUG FIX: normalise CRLF/LFCR *first* — the original replaced lone
            # "\n" and "\r" before "\r\n", so those combined-ending replaces were
            # dead code and a Windows line ending produced a doubled " -  - "
            metaDescriptionString += content.replace("\r\n", " - ").replace(
                "\n\r", " - ").replace("\n", " - ").replace("\r", " - ")
    return {"title": titleString, "description": metaDescriptionString}
def processDiscoveries(paragraph):
    """Extract the discovery links from *paragraph* and return them as a list of
    {"text", "link", "linkTitle", "linkMetaDescription"} dicts.

    NOTE(review): currently unused — the main loop inlines equivalent logic
    (see the commented-out call site there).
    """
    discoLinkList = []
    for child in paragraph.getchildren():
        if child.tag != "a":
            continue
        discoveryText = child.text
        discoveryLink = child.attrib["href"]
        # Scrape the target page; retry up to three times before accepting an
        # empty result (replaces the original's copy-pasted retry chain)
        for _ in range(3):
            discoveryDetails = readMetaAndTitle(discoveryLink)
            if discoveryDetails["title"] != "" or discoveryDetails["description"] != "":
                break
        discoLink = {"text": discoveryText, "link": discoveryLink,
                     "linkTitle": discoveryDetails["title"], "linkMetaDescription": discoveryDetails["description"]}
        # BUG FIX: the dict was previously built but never appended, so this
        # function always returned an empty list. Debug prints also removed.
        discoLinkList.append(discoLink)
    return discoLinkList
# Load the RSS feed and create an empty dictionary and list to store episode details
# Load the RSS feed and create an empty dictionary and list to store episode details
feed = feedparser.parse(feedLink)
episodeAndLinks = {}
episodes = []
print("[yellow]Calculating already processed episodes...")
processedEpisodes = listMdFiles()
# Write the index file and include a modification date.
# `with` guarantees the handle is closed even if a write fails (the original
# leaked the handle on any exception between open() and close()).
print("[yellow]Writing index file...")
with open(os.path.join(basePath, showSlug, 'index.md'), "w") as indexFile:
    indexFile.write("# Late Night Linux Discoveries"+os.linesep)
    indexFile.write(os.linesep)
    indexFile.write(
        "Please use the links in the menu to view discoveries from each of the relevant episodes."+os.linesep)
    indexFile.write(os.linesep)
    indexFile.write("Generated on: " +
                    datetime.datetime.now().strftime("%d/%m/%Y"))
# Rewrite the mkdocs.yml file to change the site version.
# Read in all lines, amend the version line, then write everything back.
with open(confFilePath, 'r') as confFile:
    confLines = []
    for line in confFile:
        if 'version:' in line:
            # BUG FIX: the replacement previously had no trailing newline, so
            # the following YAML line was glued onto the version line on rewrite
            confLines.append(
                f' version: {datetime.datetime.now().strftime("%Y-%m-%d")}\n')
        else:
            confLines.append(line)
# Reopen for writing and emit the amended lines
with open(confFilePath, "w") as confFile:
    confFile.writelines(confLines)
# Iterate through each feed entry, find the ones that contain a Discoveries
# section, scrape title/description for every discovery link, and collect the
# results into the `episodes` list for the file-writing stage below.
print("[yellow]Iterating through episodes...")
count = 0  # NOTE(review): never used below — candidate for removal
for episode in feed.entries:
    discoLinkList = []
    episodeName = episode.title
    episodeLink = episode.link
    print(f"[blue]\t{episode.title}")
    # Skip episodes that already have an MD file (matched by episode title)
    if episodeName in processedEpisodes:
        print("[green]\t\tAlready processed. Ignoring")
    else:
        # Parse the RFC-822-style published date; assumes a fixed +0000 offset
        episodePublished = datetime.datetime.strptime(
            episode.published, "%a, %d %b %Y %H:%M:%S +0000")
        episodePublishedString = datetime.datetime.strptime(
            episode.published, "%a, %d %b %Y %H:%M:%S +0000").strftime("%d/%m/%Y")
        # Locate the <p> whose first child is <strong>...Discoveries...</strong>,
        # and the next <strong>-led paragraph, which bounds the Discoveries section
        pageHtml = lxml.html.fromstring(episode.content[0].value)
        paragraphs = pageHtml.xpath("//p")
        lowCount = -1   # index of the Discoveries heading paragraph
        highCount = -1  # index of the next strong-led paragraph (exclusive bound)
        counter = 0
        print(f"[green]\t\tFinding discoveries")
        for paragraph in paragraphs:
            if len(paragraph) > 0:
                # Only the paragraph's first child element is inspected
                paragraph = paragraph.getchildren()[0]
                if paragraph.tag == "strong":
                    if type(paragraph.text) == type("") and 'Discoveries' in paragraph.text:
                        lowCount = counter
                        # discoLinkList = processDiscoveries(paragraph)
                        # pass
                    elif lowCount > -1:
                        # First strong heading after Discoveries closes the section
                        highCount = counter
                        break
            counter += 1
        # Scrape title/meta-description for every anchor inside the section.
        # NOTE(review): if Discoveries is the last section, highCount stays -1
        # and this range is empty — those discoveries would be missed; confirm.
        print(f"[green]\t\tWorking out details from the link")
        for i in range(lowCount, highCount):
            a = paragraphs[i].getchildren()
            for child in a:
                if child.tag == "a":
                    discoveryText = child.text
                    discoveryLink = child.attrib["href"]
                    # Retry the scrape up to two extra times on an empty result
                    discoveryDetails = readMetaAndTitle(discoveryLink)
                    if discoveryDetails["title"] == "" and discoveryDetails["description"] == "":
                        discoveryDetails = readMetaAndTitle(discoveryLink)
                    if discoveryDetails["title"] == "" and discoveryDetails["description"] == "":
                        discoveryDetails = readMetaAndTitle(discoveryLink)
                    discoLink = {"text": discoveryText, "link": discoveryLink,
                                 "linkTitle": discoveryDetails["title"], "linkMetaDescription": discoveryDetails["description"]}
                    discoLinkList.append(discoLink)
        # Only episodes that actually yielded discovery links are recorded
        if len(discoLinkList) > 0:
            episodes.append({'episodeName': episodeName, 'episodeLink': episodeLink, 'episodePublished': episodePublished,
                             'episodePublishedString': episodePublishedString, 'discoLinkList': discoLinkList})
# Now, write the collected episodes into a year/episode.md directory structure.
# os.makedirs(..., exist_ok=True) replaces the original check-then-mkdir pair
# (no race, one call), and `with` closes each file even if a write fails.
os.makedirs(os.path.join(basePath, showSlug), exist_ok=True)
print("[yellow]Writing MD files and directories")
for episode in episodes:
    # Create a folder for each year within the feed
    yearDir = os.path.join(basePath, showSlug,
                           str(episode['episodePublished'].year))
    os.makedirs(yearDir, exist_ok=True)
    # Create a file for each episode, named after the episode title
    with open(os.path.join(yearDir, episode['episodeName']+'.md'), 'w') as fw:
        # Write the header
        fw.write("# " + episode['episodeName']+os.linesep)
        # Add a link to the episode
        fw.write("Episode Link: ["+episode['episodeLink'] +
                 "](" + episode['episodeLink']+") "+os.linesep)
        # Add the release date
        fw.write("Release Date: "+episode['episodePublishedString']+os.linesep)
        # Add the discoveries title
        fw.write("## Discoveries"+os.linesep+os.linesep)
        # Add a table detailing all discoveries for the episode
        fw.write(f'| Name and Link | Page Title | Page Description |{os.linesep}')
        fw.write('| ----- | ----- | ----- |'+os.linesep)
        for disco in episode['discoLinkList']:
            fw.write(
                f"| [{disco['text']}]({disco['link']}) | {disco['linkTitle']} | {disco['linkMetaDescription']} |{os.linesep}")
        fw.write(os.linesep)
        # Write the generated on information
        fw.write("Generated on: " + datetime.datetime.now().strftime("%d/%m/%Y"))
    print('[red]\tWritten file for...', episode['episodeName'])
# Build the static site via the shell script, then report total runtime.
print('[yellow]Generating site...')
os.system(buildCmd)
elapsedSeconds = round(time.time() - startTime, 0)
print(f"Time taken to run: {elapsedSeconds}s")