-
Notifications
You must be signed in to change notification settings - Fork 0
/
Tool.py
35 lines (34 loc) · 1.18 KB
/
Tool.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#-*- coding:utf-8 -*-
import re
class Tool:
#去除img标签,7位长空格
removeImg = re.compile('<img.*?>| {7}|')
#删除超链接标签
removeAddr = re.compile('<a.*?>|</a>')
#把换行的标签换为\n
replaceLine = re.compile('<tr>|<div>|</div>|</p>|</dd>|</dt>')
#将表格制表<td>替换为\t
replaceTD= re.compile('<td>')
#把段落开头换为\n加空两格
replacePara = re.compile('<p.*?>')
#将换行符或双换行符替换为\n
replaceBR = re.compile('<br><br>|<br>')
#将<替换为<
replaceLT = re.compile('<')
#将>替换为>
replaceGT = re.compile('>')
#将其余标签剔除
removeExtraTag = re.compile('<.*?>')
#删除展开全部
removeZKQB = re.compile('展开全部')
def replace(self,x):
x = re.sub(self.removeZKQB,"",x)
x = re.sub(self.removeImg,"",x)
x = re.sub(self.removeAddr,"",x)
x = re.sub(self.replaceLine,"\n",x)
x = re.sub(self.replaceTD,"\t",x)
x = re.sub(self.replacePara,"\n ",x)
x = re.sub(self.replaceBR,"\n",x)
x = re.sub(self.removeExtraTag,"",x)
#strip()将前后多余内容删除
return x.strip()