diff --git a/src/representations.py b/src/representations.py index 19e7004..c3bd980 100644 --- a/src/representations.py +++ b/src/representations.py @@ -124,21 +124,19 @@ def __init__(self, component_list, dataset): var_names = [var.name for var in k.var_list] projection_list.append(dataset.extract_component(var_names)) - print "proj list" - print projection_list - print "Frequency table of dataset :" - print dataset.frequency_matrix - print "Make sure there are no zero values or IPF won't work" - print "\nStarting IPF..." + #print "proj list" #it = np.nditer(projection_list, flags=['multi_index']) #while not it.finished: # print "%f <%s>" % (it[0], it.multi_index), # it.iternext() + print "Frequency table of dataset :" + print dataset.frequency_matrix + print "Make sure there are no zero values or IPF won't work" + print "\nStarting IPF..." # initialize q: q = np.zeros(self.var_cards) q[:] = 1./q.size # initialize with equal probs assert len(component_list) == len(projection_list) - #froe2_norm = np.sum(q**2) q_prev=q cont = True itr = 1 @@ -148,16 +146,11 @@ def __init__(self, component_list, dataset): q_proj = project_q(dataset.variable_names,var_names,q) #print q_proj q = q * (projection_list[i]/q_proj) - #new_froe2_norm = np.sum(q**2) - #cont = abs(new_froe2_norm - froe2_norm) > IPF_CONV_THRESH dif = np.max(np.abs(q-q_prev)) cont = dif > IPF_CONV_THRESH q_prev=q - #froe2_norm = new_froe2_norm - #print itr itr += 1 print "Finished after {0} iterations.\nMaximum difference from previos q was {1}".format(itr-1,dif) - #print q print "\n### Q-model ###" it = np.nditer(q, flags=['multi_index']) while not it.finished: @@ -189,6 +182,7 @@ def project_q(all_variable_names,variable_list,q): if __name__ == "__main__": + print "\n ======== Begin ============\n\n" @@ -198,14 +192,17 @@ def project_q(all_variable_names,variable_list,q): #TODO: Let the user specify a variable name to read from #and also a different variable name to refernce in Varaible. ds = Data.Dataset(raw_csv="../../SampleDatasets/StackExchange/CrossValidated_AllPosts_140119.csv", - # need to include what type to expect in order to properly clean - binners=[["Score",Data.OrdinalBinner([-1,0,5]), int ], + binners=[["Score",Data.OrdinalBinner([-1,0,5]), int ], ["FavoriteCount",Data.OrdinalBinner([0]), int ], ["AnswerCount",Data.OrdinalBinner([0]), int ], ["CommentCount",Data.OrdinalBinner([0,3]), int ], ["Body",Data.TextLengthBinner([0,50,100,300]), str ]]) + # need to include what type to expect in order to properly clean + print "N of dataset:", ds.N, "\n" + + #### Test Occam3 print function occam3_filename = "test_file.oin" print "saving ",occam3_filename,"..." @@ -216,12 +213,10 @@ def project_q(all_variable_names,variable_list,q): score = Variable(name="Score", cardinality=4, abbreviation='S') favorite_count = Variable(name="FavoriteCount", cardinality=2, abbreviation='F') answer_count = Variable(name="AnswerCount", cardinality=2, abbreviation='A') - comment_count = Variable(name="CommentCount", cardinality=2, abbreviation='C') comment_count = Variable(name="CommentCount", cardinality=3, abbreviation='C') body_length = Variable(name="Body", cardinality=5, abbreviation='B') variable_list = [score, favorite_count, answer_count, comment_count, body_length] - variable_list_ipf = [favorite_count, answer_count, comment_count] print "\nVariable List: ", ','.join(map(str,variable_list)) @@ -231,13 +226,11 @@ def project_q(all_variable_names,variable_list,q): c2 = Component([]) c3 = Component([score,favorite_count,answer_count,comment_count,body_length]) c4 = Component([score,favorite_count,body_length]) - c3_ipf = Component([favorite_count,answer_count]) - c4_ipf = Component([answer_count,comment_count]) - c5_ipf = Component([favorite_count,comment_count]) + c5 = Component([answer_count,comment_count]) #### Test component print and df functions. - #print "component: ", c1, ". degrees of freedom: ", c1.return_df() - #print "component: ", c2, ". degrees of freedom: ", c2.return_df() + print "component: ", c1, ". degrees of freedom: ", c1.return_df() + print "component: ", c2, ". degrees of freedom: ", c2.return_df() print "component: ", c3, ". degrees of freedom: ", c3.return_df() print "component: ", c4, ". degrees of freedom: ", c4.return_df() print "component: ", c5, ". degrees of freedom: ", c5.return_df() @@ -246,38 +239,35 @@ def project_q(all_variable_names,variable_list,q): ## ComponentWithData: print "\nComponentWithDatas:" - #cwd1 = ComponentWithData([score, favorite_count, body_length],ds) - #cwd2 = ComponentWithData([],ds) - #cwd3 = ComponentWithData([score,favorite_count,answer_count,comment_count,body_length],ds) + cwd1 = ComponentWithData([score, favorite_count, body_length],ds) + cwd2 = ComponentWithData([],ds) + cwd3 = ComponentWithData([score,favorite_count,answer_count,comment_count,body_length],ds) - #print "component: ",cwd1,", df: ",cwd1.return_df(),". entropy: ",cwd1.return_entropy() - #print "component: ",cwd2,", df: ",cwd2.return_df(),". entropy: ",cwd2.return_entropy() - #print "component: ",cwd3,", df: ",cwd3.return_df(),". entropy: ",cwd3.return_entropy() + print "component: ",cwd1,", df: ",cwd1.return_df(),". entropy: ",cwd1.return_entropy() + print "component: ",cwd2,", df: ",cwd2.return_df(),". entropy: ",cwd2.return_entropy() + print "component: ",cwd3,", df: ",cwd3.return_df(),". entropy: ",cwd3.return_entropy() ## Model: - #print "\nModels:" + print "\nModels:" - #m1 = Model([c3]) #model of one component - #m2 = Model([c4,c5]) #model of c4 and c5 - m2 = Model([c3,c4,c5]) #model of c4 and c5 - + m1 = Model([c3]) #model of one component + m2 = Model([c4,c5]) #model of c4 and c5 - #print "Model: ",m1,", df: " + print "Model: ",m1,", df: " print "Model: ",m2,", df: " ## ModelWithData: - print "\nModelWithDatas:" + #print "\nModelWithDatas:" #print "mwd1" #mwd1 = ModelWithData([c3],ds) #model of one component #print mwd1 - print "mwd2" + #print "mwd2" #mwd2 = ModelWithData([c4,c5],ds) #model of c4 and c5 - mwd2 = ModelWithData([c3,c4,c5],ds) #model of c4 and c5 - print mwd2 + #print mwd2 # print "Model: ",m1,", df: " @@ -286,13 +276,54 @@ def project_q(all_variable_names,variable_list,q): print "\n\n ======== End ============\n" - ds_ipf = Data.Dataset(raw_csv="../../SampleDatasets/StackExchange/CrossValidated_AllPosts_140119.csv", + print "\n ======== Begin IPF Example ============\n\n" + + ## Dataset: + + #TODO: make binning object a part of Variable + #TODO: Let the user specify a variable name to read from + #and also a different variable name to refernce in Varaible. + ds = Data.Dataset(raw_csv="../../SampleDatasets/StackExchange/CrossValidated_AllPosts_140119.csv", binners=[["FavoriteCount",Data.OrdinalBinner([1]), int ], ["AnswerCount",Data.OrdinalBinner([5]), int ], ["CommentCount",Data.OrdinalBinner([1]), int ]]) print "N of dataset:", ds.N, "\n" + #### Test Occam3 print function + occam3_filename = "test_file.oin" + print "saving ",occam3_filename,"..." + ds.save_as_occam3_format(occam3_filename) + + + ## Variable: + favorite_count = Variable(name="FavoriteCount", cardinality=2, abbreviation='F') + answer_count = Variable(name="AnswerCount", cardinality=2, abbreviation='A') + comment_count = Variable(name="CommentCount", cardinality=2, abbreviation='C') + + variable_list_ipf = [favorite_count, answer_count, comment_count] + print "\nVariable List: ", ','.join(map(str,variable_list)) + + ## Component: + print "\nComponents:" + c3 = Component([favorite_count,answer_count]) + c4 = Component([answer_count,comment_count]) + c5 = Component([favorite_count,comment_count]) + + #### Test component print and df functions. + print "component: ", c3, ". degrees of freedom: ", c3.return_df() + print "component: ", c4, ". degrees of freedom: ", c4.return_df() + print "component: ", c5, ". degrees of freedom: ", c5.return_df() + + + ## ModelWithData: + # with loops + print "\nModelWithDatas:" + print "mwd1" + mwd1 = ModelWithData([c3,c4,c5],ds) #model of c4 and c5 + print mwd1 + + print "\n\n ======== End ============\n" # E0: entropy of a single component