12
12
13
13
class FileUtil:
    """Streams and copies files across local disk and AWS S3.

    The AWS credential pair is optional; when omitted, the S3 helpers
    fall back to boto's environment-based configuration.
    """

    def __init__(self, key=None, secret=None):
        # Optional AWS access key / secret, consumed lazily by the S3 methods.
        self.key = key
        self.secret = secret
15
19
def stream_decompress(self, stream):
    """Lazily gunzip an iterable of compressed chunks.

    stream -- iterable yielding gzip-compressed byte chunks

    Yields each non-empty run of decompressed bytes as it becomes
    available, so arbitrarily large files never sit fully in memory.
    """
    # 16 + MAX_WBITS tells zlib to expect a gzip header/trailer,
    # matching what the gzip module would accept.
    inflater = zlib.decompressobj(16 + zlib.MAX_WBITS)
    for block in stream:
        plain = inflater.decompress(block)
        if not plain:
            continue
        yield plain
21
25
22
def stream_text(self, k, fn):
    """Split an iterable of text chunks into lines and feed each line to fn.

    k  -- iterable yielding string chunks (e.g. an open file object)
    fn -- callable invoked once per complete line (newline stripped)

    Chunks may end mid-line, so the trailing partial line of each chunk
    is carried over and prepended to the next chunk.
    """
    unfinished = ""
    for data in k:
        data = unfinished + data
        lines = data.split("\n")
        # Last element is "" when the chunk ended on a newline, or an
        # incomplete line otherwise; hold it back until more data arrives.
        unfinished = lines.pop()
        for line in lines:
            fn(line)
    # Bug fix: a final line with no trailing newline used to be silently
    # dropped; flush it so the last record of the stream is not lost.
    if unfinished:
        fn(unfinished)
30
34
31
def stream_gzip(self, k, fn):
    """Gunzip an iterable of compressed chunks and feed each line to fn.

    k  -- iterable yielding gzip-compressed chunks
    fn -- callable invoked once per complete line (newline stripped)

    Decompressed output may end mid-line, so the trailing partial line is
    carried over between decompressed chunks.
    """
    unfinished = ""
    for data in self.stream_decompress(k):
        data = unfinished + data
        lines = data.split("\n")
        # Hold back the trailing (possibly incomplete) line.
        unfinished = lines.pop()
        for line in lines:
            fn(line)
    # Bug fix: a final line with no trailing newline used to be silently
    # dropped; flush it so the last record of the stream is not lost.
    if unfinished:
        fn(unfinished)
39
43
40
- '''
41
- Local File Stream
42
- '''
43
- class LocalFileUtil (FileUtil ):
44
-
45
44
def getFolders(self, baseFolder, startDay, numDays):
    """Build glob patterns for the last numDays day-numbered folders.

    Returns one "<baseFolder><day>/*" pattern per day in the inclusive
    range [startDay - numDays + 1, startDay].
    """
    firstDay = startDay - numDays + 1
    return [baseFolder + str(day) + "/*" for day in range(firstDay, startDay + 1)]
50
49
51
50
52
def stream_local(self, folders, fn):
    """Stream every file matched by the given glob patterns through fn.

    folders -- list of glob patterns
    fn      -- per-line callable, passed through to the line streamers

    Files ending in ".gz" are gunzipped on the fly; everything else is
    treated as plain text.
    """
    for folder in folders:
        for f in glob.glob(folder):
            k = open(f, "r")
            # Bug fix: the handle was previously never closed, leaking a
            # file descriptor per matched file.
            try:
                if f.endswith(".gz"):
                    self.stream_gzip(k, fn)
                else:
                    self.stream_text(k, fn)
            finally:
                k.close()
60
59
61
-
62
-
63
- def copy (self ,fromPath ,toPath ):
60
+ def copy_local (self ,fromPath ,toPath ):
64
61
print "copy " ,fromPath ,"to" ,toPath
65
62
dir = os .path .dirname (toPath )
66
- if not os .path .exists (dir ):
63
+ if len ( dir ) > 0 and not os .path .exists (dir ):
67
64
os .makedirs (dir )
68
65
copyfile (fromPath ,toPath )
69
66
70
- '''
71
- AWS S3 File Stream
72
- '''
73
- class S3FileUtil (FileUtil ):
74
-
75
- def __init__ (self , key = None , secret = None ):
76
- self .key = key
77
- self .secret = secret
78
- if key :
79
- self .conn = boto .connect_s3 (key ,secret )
80
- else :
81
- self .conn = boto .connect_s3 ()
82
67
83
68
def getGlob (self ,startDay ,numDays ):
84
69
g = "{" + str (startDay )
@@ -87,16 +72,24 @@ def getGlob(self,startDay,numDays):
87
72
g += "}"
88
73
return g
89
74
90
- def stream (self ,bucket ,prefix ,cl ):
75
+ def stream_s3 (self ,bucket ,prefix ,fn ):
76
+ if self .key :
77
+ self .conn = boto .connect_s3 (self .key ,self .secret )
78
+ else :
79
+ self .conn = boto .connect_s3 ()
91
80
b = self .conn .get_bucket (bucket )
92
81
for k in b .list (prefix = prefix ):
93
82
print k .name
94
83
if k .name .endswith (".gz" ):
95
- self .stream_gzip (k ,cl )
84
+ self .stream_gzip (k ,fn )
96
85
else :
97
- self .stream_text (k ,cl )
86
+ self .stream_text (k ,fn )
98
87
99
- def copy (self ,fromPath ,bucket ,path ):
88
+ def copy_s3 (self ,fromPath ,bucket ,path ):
89
+ if self .key :
90
+ self .conn = boto .connect_s3 (self .key ,self .secret )
91
+ else :
92
+ self .conn = boto .connect_s3 ()
100
93
print fromPath , bucket , path
101
94
b = self .conn .get_bucket (bucket )
102
95
source_size = os .stat (fromPath ).st_size
@@ -115,5 +108,69 @@ def copy(self,fromPath,bucket,path):
115
108
# Finish the upload
116
109
print "completing transfer to s3"
117
110
mp .complete_upload ()
118
- # k = b.new_key(path)
119
- # k.set_contents_from_filename(fromPath)
111
+
112
+ def download_s3 (self ,bucket ,s3path ,localPath ):
113
+ if self .key :
114
+ self .conn = boto .connect_s3 (self .key ,self .secret )
115
+ else :
116
+ self .conn = boto .connect_s3 ()
117
+ print bucket , s3path , localPath
118
+ b = self .conn .get_bucket (bucket )
119
+ key = b .get_key (s3path )
120
+ key .get_contents_to_filename (localPath )
121
+
122
+ def stream (self ,inputPath ,fn ):
123
+ if inputPath .startswith ("s3n://" ):
124
+ isS3 = True
125
+ inputPath = inputPath [6 :]
126
+ elif inputPath .startswith ("s3://" ):
127
+ isS3 = True
128
+ inputPath = inputPath [5 :]
129
+ else :
130
+ isS3 = False
131
+ if isS3 :
132
+ print "AWS S3 input path " ,inputPath
133
+ parts = inputPath .split ('/' )
134
+ bucket = parts [0 ]
135
+ prefix = inputPath [len (bucket )+ 1 :]
136
+ self .stream_s3 (bucket ,prefix ,fn )
137
+ else :
138
+ folders = [inputPath + "*" ]
139
+ print "local input folders: " ,folders
140
+ self .stream_local (folders ,fn )
141
+
142
def upload(self, path, outputPath):
    """Copy local file `path` to outputPath, which may be local or S3.

    "s3n://" and "s3://" destinations are split into bucket/key and sent
    via copy_s3; anything else is copied on the local filesystem.
    """
    noSchemePath = None
    if outputPath.startswith("s3n://"):
        noSchemePath = outputPath[6:]
    elif outputPath.startswith("s3://"):
        noSchemePath = outputPath[5:]
    if noSchemePath is None:
        self.copy_local(path, outputPath)
    else:
        # First path component is the bucket; the remainder is the key.
        bucket, _, opath = noSchemePath.partition('/')
        self.copy_s3(path, bucket, opath)
+
159
+ def download (self ,fromPath ,toPath ):
160
+ if fromPath .startswith ("s3n://" ):
161
+ isS3 = True
162
+ fromPath = fromPath [6 :]
163
+ elif fromPath .startswith ("s3://" ):
164
+ isS3 = True
165
+ fromPath = inputPath [5 :]
166
+ else :
167
+ isS3 = False
168
+ if isS3 :
169
+ print "AWS S3 input path " ,fromPath
170
+ parts = fromPath .split ('/' )
171
+ bucket = parts [0 ]
172
+ prefix = fromPath [len (bucket )+ 1 :]
173
+ self .download_s3 (bucket ,prefix ,toPath )
174
+ else :
175
+ self .copy_local (fromPath ,toPath )
176
+
0 commit comments