-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbrainpicking.py
99 lines (88 loc) · 3.16 KB
/
brainpicking.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
from bs4 import BeautifulSoup
from bs4 import NavigableString
import bs4
import urllib2
import sys
from Tkinter import *
# Ask the user for the article URL, download the page and parse it into
# `soup`, then set up the module-level counters and lists used by the
# scraping code further down.
url = raw_input("WELCOME TO BRAINPICKING TEXUAL MODE \nEnter URL:- ")
try:
    webpage = urllib2.urlopen(url)
    try:
        soup = bs4.BeautifulSoup(webpage.read().decode('utf8'))
    finally:
        # Release the connection even when reading or decoding fails.
        webpage.close()
except (IOError, ValueError):
    # IOError covers urllib2.URLError/HTTPError; ValueError covers
    # malformed URLs and pages that are not valid UTF-8.
    print ("Cant find url")
    # Nothing below can work without a parsed page; exit cleanly instead
    # of failing later with a NameError on `soup`.
    sys.exit(1)
i = 0  # incremented for each "holder" div; the first one is skipped because its text appears twice
m = 0  # counts child items of the "holder" divs; every other item is skipped
a = ["Heading :-", "Author :-", "\n"]  # labels prefixed to the first three printed items
img = []  # collects the src of every image that appears on the page
def text(tag):
    """Return the UTF-8 encoded text found inside *tag*.

    If *tag* is itself a NavigableString, its encoded value is returned.
    Otherwise the encoded values of all NavigableString descendants are
    concatenated in document order.

    If an ``img`` element is met while walking the descendants, the
    single-space string ``" "`` is returned instead of any text collected
    so far; the caller compares the result against ``" "`` to skip such
    items.
    """
    if isinstance(tag, NavigableString):
        return tag.encode("utf-8")
    pieces = []
    for node in tag.descendants:
        if isinstance(node, NavigableString):
            pieces.append(node.encode("utf-8"))
        elif node.name == "img":
            # Sentinel value: this item contains an image.
            return " "
    # Join once at the end rather than growing a string piece by piece.
    return "".join(pieces)
import pprint
import time
# Walk every <div class="holder"> on the page, print its text items one
# at a time (with a reading pause) and accumulate them in `fil`.
# NOTE(review): the indentation of this script was lost in the copy this
# was restored from; the nesting below is reconstructed from the logic
# and should be confirmed against the original file.
flag = 0   # becomes 1 when a <div class="callout"> is reached; marks the end
flag1 = 0  # becomes 1 when the current item is a <blockquote>
fil = ""   # everything printed so far, kept for saving to a file
head = ""  # text of the heading item (the item with m == 2)
maindata = soup.findAll("div")
for data in maindata:
    # search for data in div tags
    if isinstance(data, NavigableString):
        continue
    if data.get('class') != ['holder']:
        continue
    i = i + 1
    if i == 1:
        # Skip the first "holder" div: its text appears twice on the page.
        continue
    for item in data.contents:
        string = ""
        m = m + 1
        if m % 2 == 1:
            # Only every second child is processed.
            # presumably the odd ones are whitespace between elements - TODO confirm
            continue
        if not isinstance(item, NavigableString):
            if item.name == "div" and item.get('class') == ['callout']:
                flag = 1
            if item.name == "blockquote":
                flag1 = 1
        if flag == 1:
            # End marker reached: stop processing this div. `flag` stays
            # set, so later "holder" divs stop at their first item too.
            break
        if flag1 == 1:
            flag1 = 0
            string = string + "QUOTE:"
        if m < 7:
            # m is 2, 4, 6 here, selecting the labels a[0], a[1], a[2].
            string = string + str(a[min(m // 2 - 1, 2)])
            i = i + 1
        body = text(item)  # computed once and reused below
        if body == " ":
            # text() returns " " for items that contain an image.
            continue
        string = string + body
        if m == 2:
            head = body
        print ((string))
        fil = fil + string + "\n"
        # Pause in whole seconds, proportional to the length of the text.
        time.sleep(2 * len(string) // 50)
        #raw_input()
print ("IMGs:")
# Collect the source URL of every <img> on the page. Tags without a
# "src" attribute are skipped instead of raising KeyError.
imgs = soup.findAll("img")
for imgss in imgs:
    src = imgss.get('src')
    if src is not None:
        img.append(src)
print (img)
fil = fil + "\n Download Images at :- " + str(img)
# Save the collected data in a file named after the heading.
# `fil` holds UTF-8 encoded text; characters outside ASCII are dropped.
# The `with` block guarantees the file is flushed and closed.
with open(head + ".txt", "w") as fo:
    fo.write(fil.decode("utf-8").encode("ascii", "ignore"))