12
12
13
13
class FileUtil:
    """Streams and copies files across local disk and AWS S3.

    The AWS credential pair is optional; when omitted, the S3 helpers
    fall back to boto's environment-based configuration.
    """

    def __init__(self, key=None, secret=None):
        # Optional AWS access key / secret, consumed lazily by the S3 methods.
        self.key = key
        self.secret = secret
15
19
def stream_decompress(self, stream):
    """Lazily gunzip an iterable of compressed chunks.

    stream -- iterable yielding gzip-compressed byte chunks

    Yields each non-empty run of decompressed bytes as it becomes
    available, so arbitrarily large files never sit fully in memory.
    """
    # 16 + MAX_WBITS tells zlib to expect a gzip header/trailer,
    # matching what the gzip module would accept.
    inflater = zlib.decompressobj(16 + zlib.MAX_WBITS)
    for block in stream:
        plain = inflater.decompress(block)
        if not plain:
            continue
        yield plain
21
25
22
def stream_text(self, k, fn):
    """Split an iterable of text chunks into lines and feed each line to fn.

    k  -- iterable yielding string chunks (e.g. an open file object)
    fn -- callable invoked once per complete line (newline stripped)

    Chunks may end mid-line, so the trailing partial line of each chunk
    is carried over and prepended to the next chunk.
    """
    unfinished = ""
    for data in k:
        data = unfinished + data
        lines = data.split("\n")
        # Last element is "" when the chunk ended on a newline, or an
        # incomplete line otherwise; hold it back until more data arrives.
        unfinished = lines.pop()
        for line in lines:
            fn(line)
    # Bug fix: a final line with no trailing newline used to be silently
    # dropped; flush it so the last record of the stream is not lost.
    if unfinished:
        fn(unfinished)
30
34
31
def stream_gzip(self, k, fn):
    """Gunzip an iterable of compressed chunks and feed each line to fn.

    k  -- iterable yielding gzip-compressed chunks
    fn -- callable invoked once per complete line (newline stripped)

    Decompressed output may end mid-line, so the trailing partial line is
    carried over between decompressed chunks.
    """
    unfinished = ""
    for data in self.stream_decompress(k):
        data = unfinished + data
        lines = data.split("\n")
        # Hold back the trailing (possibly incomplete) line.
        unfinished = lines.pop()
        for line in lines:
            fn(line)
    # Bug fix: a final line with no trailing newline used to be silently
    # dropped; flush it so the last record of the stream is not lost.
    if unfinished:
        fn(unfinished)
39
43
40
- '''
41
- Local File Stream
42
- '''
43
- class LocalFileUtil (FileUtil ):
44
-
45
44
def getFolders(self, baseFolder, startDay, numDays):
    """Build glob patterns for the last numDays day-numbered folders.

    Returns one "<baseFolder><day>/*" pattern per day in the inclusive
    range [startDay - numDays + 1, startDay].
    """
    firstDay = startDay - numDays + 1
    return [baseFolder + str(day) + "/*" for day in range(firstDay, startDay + 1)]
50
49
51
50
52
def stream_local(self, folders, fn):
    """Stream every file matched by the given glob patterns through fn.

    folders -- list of glob patterns
    fn      -- per-line callable, passed through to the line streamers

    Files ending in ".gz" are gunzipped on the fly; everything else is
    treated as plain text.
    """
    for folder in folders:
        for f in glob.glob(folder):
            k = open(f, "r")
            # Bug fix: the handle was previously never closed, leaking a
            # file descriptor per matched file.
            try:
                if f.endswith(".gz"):
                    self.stream_gzip(k, fn)
                else:
                    self.stream_text(k, fn)
            finally:
                k.close()
60
59
61
-
62
-
63
- def copy (self ,fromPath ,toPath ):
60
+ def copy_local (self ,fromPath ,toPath ):
64
61
print "copy " ,fromPath ,"to" ,toPath
65
62
dir = os .path .dirname (toPath )
66
- if not os .path .exists (dir ):
63
+ if len ( dir ) > 0 and not os .path .exists (dir ):
67
64
os .makedirs (dir )
68
65
copyfile (fromPath ,toPath )
69
66
70
- '''
71
- AWS S3 File Stream
72
- '''
73
- class S3FileUtil (FileUtil ):
74
-
75
- def __init__ (self , key = None , secret = None ):
76
- self .key = key
77
- self .secret = secret
78
- if key :
79
- self .conn = boto .connect_s3 (key ,secret )
80
- else :
81
- self .conn = boto .connect_s3 ()
82
67
83
68
def getGlob (self ,startDay ,numDays ):
84
69
g = "{" + str (startDay )
@@ -87,16 +72,24 @@ def getGlob(self,startDay,numDays):
87
72
g += "}"
88
73
return g
89
74
90
- def stream (self ,bucket ,prefix ,cl ):
75
+ def stream_s3 (self ,bucket ,prefix ,fn ):
76
+ if self .key :
77
+ self .conn = boto .connect_s3 (self .key ,self .secret )
78
+ else :
79
+ self .conn = boto .connect_s3 ()
91
80
b = self .conn .get_bucket (bucket )
92
81
for k in b .list (prefix = prefix ):
93
82
print k .name
94
83
if k .name .endswith (".gz" ):
95
- self .stream_gzip (k ,cl )
84
+ self .stream_gzip (k ,fn )
96
85
else :
97
- self .stream_text (k ,cl )
86
+ self .stream_text (k ,fn )
98
87
99
- def copy (self ,fromPath ,bucket ,path ):
88
+ def copy_s3 (self ,fromPath ,bucket ,path ):
89
+ if self .key :
90
+ self .conn = boto .connect_s3 (self .key ,self .secret )
91
+ else :
92
+ self .conn = boto .connect_s3 ()
100
93
print fromPath , bucket , path
101
94
b = self .conn .get_bucket (bucket )
102
95
source_size = os .stat (fromPath ).st_size
@@ -115,5 +108,69 @@ def copy(self,fromPath,bucket,path):
115
108
# Finish the upload
116
109
print "completing transfer to s3"
117
110
mp .complete_upload ()
118
- # k = b.new_key(path)
119
- # k.set_contents_from_filename(fromPath)
111
+
112
+ def download_s3 (self ,bucket ,s3path ,localPath ):
113
+ if self .key :
114
+ self .conn = boto .connect_s3 (self .key ,self .secret )
115
+ else :
116
+ self .conn = boto .connect_s3 ()
117
+ print bucket , s3path , localPath
118
+ b = self .conn .get_bucket (bucket )
119
+ key = b .get_key (s3path )
120
+ key .get_contents_to_filename (localPath )
121
+
122
+ def stream (self ,inputPath ,fn ):
123
+ if inputPath .startswith ("s3n://" ):
124
+ isS3 = True
125
+ inputPath = inputPath [6 :]
126
+ elif inputPath .startswith ("s3://" ):
127
+ isS3 = True
128
+ inputPath = inputPath [5 :]
129
+ else :
130
+ isS3 = False
131
+ if isS3 :
132
+ print "AWS S3 input path " ,inputPath
133
+ parts = inputPath .split ('/' )
134
+ bucket = parts [0 ]
135
+ prefix = inputPath [len (bucket )+ 1 :]
136
+ self .stream_s3 (bucket ,prefix ,fn )
137
+ else :
138
+ folders = [inputPath + "*" ]
139
+ print "local input folders: " ,folders
140
+ self .stream_local (folders ,fn )
141
+
142
def upload(self, path, outputPath):
    """Copy local file `path` to outputPath, which may be local or S3.

    "s3n://" and "s3://" destinations are split into bucket/key and sent
    via copy_s3; anything else is copied on the local filesystem.
    """
    noSchemePath = None
    if outputPath.startswith("s3n://"):
        noSchemePath = outputPath[6:]
    elif outputPath.startswith("s3://"):
        noSchemePath = outputPath[5:]
    if noSchemePath is None:
        self.copy_local(path, outputPath)
    else:
        # First path component is the bucket; the remainder is the key.
        bucket, _, opath = noSchemePath.partition('/')
        self.copy_s3(path, bucket, opath)
+
159
+ def download (self ,fromPath ,toPath ):
160
+ if fromPath .startswith ("s3n://" ):
161
+ isS3 = True
162
+ fromPath = fromPath [6 :]
163
+ elif fromPath .startswith ("s3://" ):
164
+ isS3 = True
165
+ fromPath = inputPath [5 :]
166
+ else :
167
+ isS3 = False
168
+ if isS3 :
169
+ print "AWS S3 input path " ,fromPath
170
+ parts = fromPath .split ('/' )
171
+ bucket = parts [0 ]
172
+ prefix = fromPath [len (bucket )+ 1 :]
173
+ self .download_s3 (bucket ,prefix ,toPath )
174
+ else :
175
+ self .copy_local (fromPath ,toPath )
176
+
0 commit comments