test_class_tpa.py
# -*- coding: utf-8 -*-
import glob
import json
import os
import sys

import pandas as pd

import twi_stat_to_panda as tpa
try:
    import twi_stat_to_panda_p as tpp  # only needed for the selt == 'p' branch
except ImportError:
    tpp = None


def create_df(fildir, selt, outname, write_csv=False, multihas=False,
              r_or_p='ruby', verbose=True, seen=None, htadds=None):
    """Convert every *.json tweet dump in fildir into one pandas DataFrame.

    selt picks the parser: 'r' (tweets via tpa), 'p' (the tpp variant) or
    'rr' (users). seen accumulates tweet ids so duplicates are skipped;
    htadds, if given, keeps only rows whose hashtag is in that collection.
    Returns the DataFrame and the list of files that were read.
    """
    if seen is None:
        # Fresh set per call: a mutable default argument would be shared
        # across calls and silently drop tweets seen in earlier runs.
        seen = set()
    os.chdir(fildir)
    json_files = glob.glob('*.json')
    if len(json_files) == 0:
        raise RuntimeError('No dump files to convert.')
    if verbose:
        print(json_files)
    lol = []   # buffer of row dicts, flushed into pdf every `cos` tweets
    pdf = None
    for filna in json_files:
        cos = 400000
        tem = cos
        u = 0
        with open(filna) as fop:
            for fo in fop:
                try:
                    dici = json.loads(fo)
                except ValueError:
                    continue  # skip lines that are not valid JSON
                if selt == 'r':
                    h, lolo = tpa.TweetToPandas(dici, r_or_p).as_dict_hash()
                elif selt == 'p':
                    if tpp is None:
                        raise ImportError("selt='p' needs twi_stat_to_panda_p")
                    h, lolo = tpp.TweetToPandas(dici, r_or_p).as_dict_hash()
                elif selt == 'rr':
                    h, lolo = tpa.TweetToPandas(dici, r_or_p).users_as_dict_hash()
                else:
                    raise ValueError("selt must be 'r', 'p' or 'rr'")
                seene = dici.get('id', None)
                if multihas:
                    h, lolo = tpa.TweetToPandas(dici, r_or_p).users_as_dict_hash()
                    if h is False or seene in seen:
                        continue
                    # one row per hashtag, optionally filtered by htadds
                    for hasht in h['hashtags']:
                        if htadds is None or hasht in htadds:
                            nlolo = dict(lolo)
                            nlolo['Hashtag'] = hasht
                            lol.append(nlolo)
                else:
                    if h is False or seene in seen:
                        continue
                    lol.append(lolo)
                if seene is not None:
                    seen.add(seene)
                if u >= tem:
                    # periodic flush so the row buffer never grows unbounded
                    print(u)
                    tem += cos
                    if isinstance(pdf, pd.DataFrame):
                        pdf = pd.concat([pdf, pd.DataFrame(lol)],
                                        ignore_index=True)
                    else:
                        pdf = pd.DataFrame(lol)
                    lol = []
                u += 1
        if verbose:
            print(filna)
            print(len(lol))
    # flush whatever is still buffered after the last file
    if isinstance(pdf, pd.DataFrame):
        pdf = pd.concat([pdf, pd.DataFrame(lol)], ignore_index=True)
    else:
        pdf = pd.DataFrame(lol)
    print(pdf.columns)
    # pdf['created_at'] = pd.to_datetime(pdf['created_at'],
    #                                    format='%a %b %d %H:%M:%S +0000 %Y')
    if write_csv:
        pdf.to_csv(outname, header=True)
    return pdf, json_files
#
#
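# The parser interface used above is defined in twi_stat_to_panda, not in
# this file. A minimal sketch of the contract create_df assumes, inferred
# from the calls above rather than from that module's documentation:
#
#     h, lolo = tpa.TweetToPandas(dici, r_or_p).as_dict_hash()
#     # h    -- False when the tweet should be dropped, otherwise a dict
#     #         holding at least an iterable under the 'hashtags' key
#     # lolo -- a flat dict of scalar fields, i.e. one DataFrame row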
if __name__ == '__main__':
    print('test_class_tpa is being run by itself')
    print(sys.argv)
    fildir = sys.argv[1]
    selt = sys.argv[2]
    outname = sys.argv[3]
    # pass write_csv=True here to actually write the CSV
    create_df(fildir, selt, outname + 'out.ccc')
else:
    print('I am being imported from another module')
# Example from the original author's environment:
# filedir = '/home/sergios-len/MEGAsync Downloads/'
# selt = 'p'
# outname = '/home/sergios-len/MEGAsync Downloads/'
# create_df(filedir, selt, outname)
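# A minimal usage sketch when imported (the path and output name are
# illustrative assumptions, not part of this repository):
#
#     import test_class_tpa as tct
#     df, files = tct.create_df('/path/to/dumps', 'r', 'tweets.csv',
#                               write_csv=True)
#     print(df.head())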