-
Notifications
You must be signed in to change notification settings - Fork 28
User aggregators #95
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
User aggregators #95
Changes from 10 commits
917e118
a6e1c3d
a31e918
e4b0f05
ba6b2ff
1829489
de27178
a7fd7b6
81e8f37
2972425
b161ab2
81af781
3295b15
a05a385
ff5b37b
6b6aa9f
0da11ef
737060d
db0a04f
98eb3ac
fb23150
491adfc
72711ce
edf12ff
656775d
e691a5b
390fc86
6ecf209
fec8ee8
739eb3c
459000f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,172 @@ | ||
| p: Project = input; | ||
| type fv = {a:int, b:int, c:int, d:int}; | ||
| type stats = {a_stat:float, b_stat:float, c_stat:float}; | ||
| type complete_stat = {avg: stats, dev: stats}; | ||
| type Data = {training: fv, testing: fv}; | ||
| splitRatio : float = 0.67; | ||
|
|
||
| naive := function(vals : array of Data) : float { | ||
| train : array of fv; | ||
| test : array of fv; | ||
|
|
||
| spearated: map[int] of array of fv; # classified per value | ||
| summaries : map[int] of complete_stat; | ||
|
|
||
| # separate the training and testing datasets | ||
| foreach(i:int; def(vals[i])) { | ||
| if(def(train)) { | ||
| train = train + {vals[i].training}; | ||
| } else { | ||
| train = {vals[i].training}; | ||
| } | ||
| if(def(test)) { | ||
| test = test+ {vals[i].testing}; | ||
| } else { | ||
| test = {vals[i].testing}; | ||
| } | ||
|
|
||
| } | ||
|
|
||
|
|
||
| # classify training datasets | ||
| foreach(i:int; def(train[i])) { | ||
| temp : array of fv = {train[i]}; | ||
| if(!haskey(spearated, train[i].d)) { | ||
| spearated[train[i].d] = temp; | ||
| } else { | ||
| spearated[train[i].d] = spearated[train[i].d] + temp; | ||
| } | ||
| } | ||
|
|
||
| # all the classes | ||
| classes : array of int = keys(spearated); | ||
|
|
||
| # summarize data from training dataset | ||
| foreach(i:int; def(classes[i])) { | ||
| # calculate mean | ||
| feature_mean : stats = {0.0, 0.0, 0.0}; | ||
| foreach(j:int; def(spearated[classes[i]][j])) { | ||
| feature_mean.a_stat = feature_mean.a_stat + spearated[classes[i]][j].a; | ||
| feature_mean.b_stat = feature_mean.b_stat + spearated[classes[i]][j].b; | ||
| feature_mean.c_stat = feature_mean.c_stat + spearated[classes[i]][j].c; | ||
| } | ||
| feature_mean.a_stat = feature_mean.a_stat / len(spearated[classes[i]]); | ||
| feature_mean.b_stat = feature_mean.b_stat / len(spearated[classes[i]]); | ||
| feature_mean.c_stat = feature_mean.c_stat / len(spearated[classes[i]]); | ||
|
|
||
|
|
||
| # calculate sd | ||
| feature_sd : stats = {0.0, 0.0, 0.0}; | ||
| foreach(j:int; def(spearated[classes[i]][j])) { | ||
| feature_sd.a_stat = feature_sd.a_stat + (spearated[classes[i]][j].a - feature_mean.a_stat); | ||
| feature_sd.b_stat = feature_sd.b_stat + (spearated[classes[i]][j].b - feature_mean.b_stat); | ||
| feature_sd.c_stat = feature_sd.c_stat + (spearated[classes[i]][j].c - feature_mean.c_stat); | ||
| } | ||
| feature_sd.a_stat = sqrt(feature_sd.a_stat / len(spearated[classes[i]])); | ||
| feature_sd.b_stat = sqrt(feature_sd.b_stat / len(spearated[classes[i]])); | ||
| feature_sd.c_stat = sqrt(feature_sd.c_stat / len(spearated[classes[i]])); | ||
|
|
||
| # summarized a class | ||
| summaries[classes[i]] = {feature_mean, feature_sd}; | ||
| } | ||
|
|
||
|
|
||
| predictions: array of int; | ||
| predictions = new(predictions, len(test), -1); | ||
|
|
||
| # predict for each test data | ||
| foreach(i:int; def(test[i])) { | ||
| probabilities : map[int] of float; | ||
| foreach(j: int; def(classes[j])) { | ||
| probabilities[classes[j]] = 1.0; | ||
| mean := summaries[classes[j]].avg; | ||
| deviation := summaries[classes[j]].dev; | ||
| probabilities[classes[j]] = probabilities[classes[j]] * (1/ (sqrt(2 * 3.14) * deviation.a_stat)) * (exp(-1 * ((pow((1.0 * test[i].a) - mean.a_stat, 2))/(2 * pow(deviation.a_stat, 2))))); | ||
| probabilities[classes[j]] = probabilities[classes[j]] * (1/ (sqrt(2 * 3.14) * deviation.a_stat)) * (exp(-1 * ((pow((1.0 * test[i].b) - mean.b_stat, 2))/(2 * pow(deviation.b_stat, 2))))); | ||
| probabilities[classes[j]] = probabilities[classes[j]] * (1/ (sqrt(2 * 3.14) * deviation.a_stat)) * (exp(-1 * ((pow((1.0 * test[i].c) - mean.c_stat, 2))/(2 * pow(deviation.c_stat, 2))))); | ||
| } | ||
|
|
||
| bestProb : float = 0; | ||
| bestLab : int = -1; | ||
| foreach(j: int; def(classes[j])) { | ||
| if ((bestLab == -1) || (bestProb < probabilities[classes[j]])) { | ||
| bestProb = probabilities[classes[j]]; | ||
| bestLab = classes[j]; | ||
| } | ||
| } | ||
| predictions[i] = bestLab; | ||
| } | ||
|
|
||
| correct : float = 0.0; | ||
| foreach(i:int; def(test[i])) { | ||
| if(predictions[i] == test[i].d) { | ||
| correct = correct + 1.0; | ||
| } | ||
| } | ||
| return correct/len(test) * 100; | ||
| }; | ||
|
|
||
| scale := function(ast: int, method: int, class: int) : int { | ||
| total : int = 0; | ||
| if(ast > 1000) { | ||
| total++; | ||
| } if(method > 500) { | ||
| total++; | ||
| } if(class > 50) { | ||
| total++; | ||
| } | ||
| return total; | ||
| }; | ||
|
|
||
|
|
||
| naive_bayes : output naive of Data; | ||
|
|
||
| # count ast nodes | ||
|
|
||
| astCount := 0; | ||
| classCount := 0; | ||
| methodCount := 0; | ||
| visit(p, visitor { | ||
| # only look at the latest snapshot | ||
| before n: CodeRepository -> { | ||
| snapshot := getsnapshot(n); | ||
| foreach (i: int; def(snapshot[i])) | ||
| visit(snapshot[i]); | ||
| stop; | ||
| } | ||
| before node: Declaration -> { | ||
| if (node.kind == TypeKind.CLASS) { | ||
| classCount++; | ||
| foreach (i: int; node.methods[i]) { | ||
| methodCount++; | ||
| } | ||
| } | ||
| } | ||
| # by default, count all visited nodes | ||
| before _ -> astCount++; | ||
| # these nodes are not part of the AST, so do nothing when visiting | ||
| before Project, ChangedFile -> ; | ||
| }); | ||
|
|
||
|
|
||
|
|
||
| dummy : fv = {0, 0, 0, 0}; | ||
| nondummy : fv = {astCount, methodCount, classCount, scale(astCount, methodCount, classCount)}; | ||
| data1: Data = {nondummy, dummy}; | ||
| data2: Data = {dummy, nondummy}; | ||
| if(rand() > splitRatio) | ||
| naive_bayes << data1; | ||
| else | ||
| naive_bayes << data2; | ||
|
|
||
|
|
||
| if(rand() > splitRatio) | ||
| naive_bayes << data1; | ||
| else | ||
| naive_bayes << data2; | ||
|
|
||
|
|
||
| if(rand() > splitRatio) | ||
| naive_bayes << data1; | ||
| else | ||
| naive_bayes << data2; | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| package boa; | ||
|
|
||
| public interface BoaEnumInterface { | ||
| Object getValue(); | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,12 @@ | ||
| package boa; | ||
|
|
||
| import java.io.IOException; | ||
|
|
||
|
|
||
| public interface BoaTup { | ||
| public String[] getValues(); | ||
| public byte[] serialize(Object o) throws IOException; | ||
| public Object getValue(String f); | ||
| public String toString(); | ||
| public String[] getFieldNames(); | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,6 @@ | ||
| package boa.aggregators; | ||
|
|
||
|
|
||
| @AggregatorSpec(name = "UserDefinedAgg", type = "UserDefined", canCombine = false) | ||
| public abstract class UserDefinedAggregator extends Aggregator { | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,6 +22,7 @@ | |
| import java.util.*; | ||
| import java.util.Map.Entry; | ||
|
|
||
| import boa.aggregators.UserDefinedAggregator; | ||
| import org.scannotation.AnnotationDB; | ||
|
|
||
| import boa.aggregators.AggregatorSpec; | ||
|
|
@@ -435,15 +436,28 @@ else if (aggregators.containsKey(name)) | |
| } | ||
|
|
||
| public List<Class<?>> getAggregators(final String name, final BoaType type) { | ||
| final List<Class<?>> aggregators = new ArrayList<Class<?>>(); | ||
|
|
||
| if (type instanceof BoaTuple) | ||
| for (final BoaType subType : ((BoaTuple) type).getTypes()) | ||
| aggregators.add(this.getAggregator(name, subType)); | ||
| else | ||
| aggregators.add(this.getAggregator(name, type)); | ||
| final List<Class<?>> searchResult = new ArrayList<Class<?>>(); | ||
| if (possibleInBuiltAgg(name)) { | ||
| if (type instanceof BoaTuple) | ||
| searchResult.add(this.getAggregator(name, type)); | ||
| else if (type instanceof BoaArray) | ||
| searchResult.add(this.getAggregator(name, ((BoaArray)type).getType())); | ||
| else | ||
| searchResult.add(this.getAggregator(name, type)); | ||
| } else if (this.functions.hasFunction(name)) { | ||
| searchResult.add(UserDefinedAggregator.class); | ||
| } | ||
| return searchResult; | ||
| } | ||
|
|
||
| return aggregators; | ||
| private boolean possibleInBuiltAgg(String name) { | ||
| Set<String> names = aggregators.keySet(); | ||
| for (final String entry: names) { | ||
| if(entry.contains(name)) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. should this be contains()? so if I have a built-in named 'top' and I try to use an aggregator 'op' it matches? I think you just want a simple equals() comparison here. |
||
| return true; | ||
| } | ||
| } | ||
| return false; | ||
| } | ||
|
|
||
| private static void importFunction(final Method m) { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Move this to 'test/known-good/' ? We dont have an examples directory and if you are going to put code examples in there, might as well use them as test cases.