fellowship · yevhen78 · Nov 25, 2015 · Nov 25, 2015 · Nov 25, 2015
diff --git a/pyrite.py b/pyrite.py
@@ -0,0 +1,303 @@
+import numpy
+import math
+import pandas
+import random
+import matplotlib.pyplot as plt
+import os
+from astroML.plotting import hist
+
+class Pyrite:
+
+
+    def __init__(self, dataset):
+        # Convert input dataset to datafraem and handle nulls
+        # Store dataset in self.df
+        self.df = pandas.DataFrame(dataset.copy())
+        self.mean = 0.0
+        self.std = 0.0
+
+
+    def auto_discretize(self,num_data,method,range_min_max):
+
+        """
+        Perform automatic discretization of a selected feature; a method
+        (bayesian blocks, scott method or fixed bin number) along the desired data range is passed to a special version of hist which gives cutpoints for discretization and returns the "categorized" version of the original data
+        """
+        hist_data = hist(num_data, bins=method,range=range_min_max)
+        plt.close('all')
+        leng = len(hist_data[1])
+        # fix cutoff to make sure outliers are properly categorized as well if necessary
+        hist_data[1][leng-1] = num_data.max()
+        #hist_data[1][0] = num_data.min()
+        # automatically assign category labels of '1','2',etc
+        cat_data = pandas.cut(num_data,hist_data[1],labels=range(1,leng),include_lowest='TRUE')
+        return pandas.Series(cat_data).astype(str)
+
+    def discretize(self, columns, method = 'blocks'):
+        """
+    Discretizes user provided list of numerical columns using auto_discretization function above. If a different method for discretization is desired, for a specific column range, this function should be called on a per column basis with parameters set for each particular column
+
+        Input:
+        columns: list of columns to dicretize.
+        method: default method is 'blocks' (Bayesian blocks), other options is 'scott' or fixed number of bins
+        plot: if True - plot histogram, default = False
+
+        Output:
+        None, changes columns in the self.df data frame.
+
+        """
+        print "\ndiscretizing numerical attributes ..."
+
+        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
+
+
+        for col in columns:
+            if not(self.df[col].dtype in numerics):
+                err = 'Column ' + str(col) + ' is not numeric!'
+                raise Exception(err)
+
+        num_cols = len(columns)
+        for col in columns:
+            col_range = (self.df[col].min(),self.df[col].max())
+            self.df[col] = self.auto_discretize(self.df[col], method, range_min_max = col_range)
+
+
+
+    def compute_frequency(self,y, s0, s1):
+
+        """
+        Helper method - Compute Frequency of "y" in "diDF"
+
+        Input:
+        y: A single instance out of subsample
+        df: dataset
+        s0: list of features that represent the first dimension in 2D subspaces
+        s1: list of features that represent the second dimension in 2D subspaces
+
+        Output:
+        freq_in_subspaces: For the current subsample, it's a "pandas series" of frequencies
+        of y in each subspace
+        """
+
+        global theta
+        n, d = self.df.shape
+
+        # elementwise comparison operation between the "y" in subsample and each raw in the original data frame
+        # summed up along subsample in a global theta array simultaneously for all the instances
+        occurances = y==numpy.array(self.df)
+        occurances0 = occurances[:,s0]
+        occurances1 = occurances[:,s1]
+
+        theta = theta + occurances0*occurances1
+
+        return 0
+
+
+
+
+    def score_dataset(self, samples_num, sample_size,seed = None):
+        """
+        Returns a numpy array of scores for every instance in the dataset
+
+        Input:
+        samples_num: number of subsamples chosen randomly without replacement
+        sample_size: size of each subsample
+        seed: seed for random number generator
+
+        Output:
+        scores: Score for each instance in the dataset (numpy array)
+        """
+        print "\ncomputing anomaly score for the dataset instances ..."
+
+        (n,d) = (self.df).shape
+        columns = (self.df).columns
+        indices = (self.df).index
+
+        zeros = numpy.zeros(d)
+        scores = numpy.zeros(n)
+        global theta
+
+        random.seed(seed)
+        # Loop over all subsamples
+        for i in range(0,samples_num):
+            #create a subsample of indices
+            d_i = random.sample(range(0,n),sample_size)
+
+            df_ndarray = numpy.array(self.df)
+            diDF = df_ndarray[d_i]
+            theta = numpy.zeros(self.df.shape)
+
+
+            #create random d random subspaces by permuting columnss 0:d-1 and pairing adjacent values
+            # randomly reorder set of column names
+            s = random.sample(range(0,d),d)
+            s0 = s[0:len(s)]
+            s1 = s[1:len(s)]+[s[0]]
+
+            # create frequency table
+            # in this version we compare each row of a subsample to the whole data frame
+            # and then sum them up along the subsample in a global theta variable
+            numpy.apply_along_axis(self.compute_frequency, 1,diDF, s0,s1)
+            scores = scores + numpy.sum(theta == zeros,axis = 1)
+
+        self.mean = scores.mean()
+        self.std = scores.std()
+
+
+        return pandas.Series((scores - self.mean)/self.std,index = indices)
+
+
+
+
+    def score_instance(self, idx, samples_num, sample_size, seed = None):
+
+        """
+        Same as pyrite but scores a single instance.
+
+        Input:
+        idx: Index of the instance whose score is desired
+        samples_num: number of subsamples chosen randomly without replacement
+        sample_size: size of each subsample
+
+        Output:
+        score: float - Anomaly Score of single_instance
+        """
+
+        n,d = self.df.shape
+
+        #y = single_instance.fillna('')
+
+        y = numpy.array(self.df.ix[idx])
+        df_ndarray = numpy.array(self.df)
+
+        zeros = numpy.zeros(d)
+        score = 0
+
+        random.seed(seed)
+        # Loop over all subsamples
+        for i in range(0,samples_num):
+
+            #create a subsample of indices
+            d_i = random.sample(range(0,n),sample_size)
+            diDF = df_ndarray[d_i]
+
+            #create random d random subspaces by permuting columnss 0:d-1 and pairing adjacent values
+            # randomly reorder set of column names
+            s = random.sample(range(0,d),d)
+            s0 = s[0:len(s)]
+            s1 = s[1:len(s)]+[s[0]]
+
+            # compute score
+            occurances = y==diDF
+            occurances0 = occurances[:,s0]
+            occurances1 = occurances[:,s1]
+            score = score + numpy.sum((occurances0*occurances1).sum(axis = 0) == zeros)
+
+        return 1.0*(score-self.mean)/self.std
+
+
+    def instance_inspect(self, idx, plot = False):
+        """
+        Compute the inverse relative frequency of a category (pair of categories) in each column (pairs of columns)
+        for categories in an anomalous instance index by idx.
+        inverse relative frequency = total # of instances/(# of instances with fixed category X # of categories for that feature )
+        Plot relative frequencies (optional).
+
+        Input:
+        idx: index of a single instance to inspect.
+        plot: Boolean - plot inverse relative frequency
+
+        Output:
+        (freq_1_d,freq_2d)
+            freq_1d: inverse relative frequencies for categories 1xd numpy array
+            freq_2d: inverse relative frequencies for pairs of categories dxd numpy array
+        """
+        n, d = self.df.shape
+
+
+        colnames = self.df.columns
+        sizes = []
+        for c in colnames:
+            sizes = sizes + [len(self.df[c].unique())]
+        sizes = numpy.array(sizes)
+
+        y = numpy.array(self.df.ix[idx])
+        df_ndarray = numpy.array(self.df)
+
+
+        # elementwise comparison operation between outlier y and each raw in the original data frame
+        # summed up along columns and divided by total number of elements per category
+        occurances = (y==df_ndarray)
+        freq_1d = 1.0/((occurances.sum(axis = 0)/(1.0*n/sizes)))
+
+        #form all feature pairs
+        freq_2d = numpy.zeros((d,d))
+        for i in range(0,d-1):
+            for j in range(i+1,d):
+                freq_2d[i,j] = 1.0/(1.0*(occurances[:,i]*occurances[:,j]).sum(axis = 0)/(1.0*n/sizes[i]/sizes[j]))
+
+
+        # plot tables
+        if plot:
+            fig = plt.figure(figsize=(10,15))
+            ax = fig.add_subplot(2,1,1)
+            plt.subplots_adjust(hspace = 0.5)
+            w=0.35
+            ax.bar(range(0,d),freq_1d)
+            ax.set_xlim(0,d)
+            ax.set_ylim(0,max(freq_1d)*1.1)
+            ax.set_ylabel('1/relative frequency of a category', fontsize = 14)
+            ax.set_title('Category inverted relative frequencies\n'+
+                         '(frequency of specified category times number of categories)\n'+
+                         'for each column',
+                            fontsize =16,)
+            xTickMarks = [str(colnames[i]) + ': ' + str(y[i]) for i in range(0,d)]
+            ax.set_xticks(numpy.array(range(0,d))+0.5)
+            xtickNames = ax.set_xticklabels(xTickMarks)
+            plt.setp(xtickNames, rotation=90, fontsize=10)
+
+            ax1 = fig.add_subplot(2,1,2)
+            ax1.set_ylabel('1/relative frequency of a pair of categories', fontsize = 14)
+            ax1.set_title('Pairs of categories inverted relative frequencies\n'+
+                          '(frequency of specified category times number ofcategories)\n'+
+                          'for each column',
+                             fontsize = 16)
+            im = ax1.imshow(freq_2d.T,interpolation = "none")
+            tickMarks = [str(colnames[i]) + ': ' + str(y[i]) for i in range(0,d)]
+            ax1.set_xticks(numpy.array(range(0,d))+0)
+            ax1.set_yticks(numpy.array(range(0,d))+0)
+            xtickNames = ax1.set_xticklabels(tickMarks)
+            ytickNames = ax1.set_yticklabels(tickMarks)
+            plt.setp(xtickNames, rotation=90, fontsize=10)
+            plt.setp(ytickNames, rotation=0, fontsize=10)
+            plt.colorbar(im)
+            plt.show()
+
+        return (freq_1d,freq_2d)
+
+
+
+
+
+    def get_feature_importance(self, idx):
+        """
+        Calls instance_inspect, selects maximum elements of the tables and organizes output into dictionary
+
+        Input:
+        idx: index of anomaly in the data frame df
+
+        Output:
+        dictionary with locations and scores of single most rare feature and single most rare column
+        """
+
+        t1,t2 = self.instance_inspect(idx, plot = False)
+        columns = list(self.df.columns)
+        d1_score = t1.max()
+        d1_loc = [columns[i] for i in numpy.where(t1 == t1.max())[0]]
+
+        d2_score = t2.max()
+        d2_loc = numpy.where(t2 == t2.max())
+        d2_loc = [(columns[d2_loc[0][i]],columns[d2_loc[1][i]]) for i in range(len(d2_loc[0]))]
+        return {'single feature':(d1_loc,d1_score),'pair features':(d2_loc,d2_score)}
+
+
diff --git a/pyrite/pyrite.py b/pyrite/pyrite.py
@@ -13,6 +13,8 @@ def __init__(self, dataset):
         # Convert input dataset to datafraem and handle nulls
         # Store dataset in self.df
         self.df = pandas.DataFrame(dataset.copy())
+        self.mean = 0.0
+        self.std = 0.0
 
 
     def auto_discretize(self,num_data,method,range_min_max):
@@ -138,9 +140,12 @@ def score_dataset(self, samples_num, sample_size,seed = None):
             numpy.apply_along_axis(self.compute_frequency, 1,diDF, s0,s1)
             scores = scores + numpy.sum(theta == zeros,axis = 1)
 
+        self.mean = scores.mean()
+        self.std = scores.std()
 
-        return pandas.Series(1.0*scores/samples_num/d, index = indices)
 
+        return pandas.Series((scores - self.mean)/self.std,index = indices)
+
 
 
 
@@ -158,8 +163,6 @@ def score_instance(self, idx, samples_num, sample_size, seed = None):
         score: float - Anomaly Score of single_instance
         """
 
-        print "\ncomputing anomaly score for a single instance ..."
-
         n,d = self.df.shape
 
         #y = single_instance.fillna('')
@@ -190,7 +193,7 @@ def score_instance(self, idx, samples_num, sample_size, seed = None):
             occurances1 = occurances[:,s1]
             score = score + numpy.sum((occurances0*occurances1).sum(axis = 0) == zeros)
 
-        return 1.0*score/samples_num/d
+        return 1.0*(score-self.mean)/self.std
 
 
     def instance_inspect(self, idx, plot = False):
@@ -286,7 +289,6 @@ def get_feature_importance(self, idx):
         Output:
         dictionary with locations and scores of single most rare feature and single most rare column
         """
-        print "\ngetting important features ..."
         t1,t2 = self.instance_inspect(idx, plot = False)
         columns = list(self.df.columns)
         d1_score = t1.max()