Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/bloom filter for dict #1397

Merged
merged 16 commits into from
Sep 20, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
db33344
feat: created bloom filter for english dictionary words
TangoBeee Aug 22, 2024
37425d6
Merge branch 'akto-api-security:master' into feature/bloom_filter_for…
TangoBeee Aug 22, 2024
fbc807d
fix: loading the dictionary binary while initializing context
TangoBeeAkto Aug 23, 2024
550559b
fix: added a camel case check for isEnglishWord and fixed some bugs
TangoBeeAkto Aug 23, 2024
afcc9bd
fix: loading dictionary binary in TestDBSync
TangoBeeAkto Aug 23, 2024
554c161
feat: added more unit tests and added two letters words in the dictio…
TangoBeeAkto Aug 26, 2024
48863a6
removed words_alpha.txt from the resource folder
TangoBeeAkto Aug 27, 2024
f763179
feat: created bloom filter for english dictionary words
TangoBeee Aug 22, 2024
3c8b8ca
fix: loading the dictionary binary while initializing context
TangoBeeAkto Aug 23, 2024
93e1ea4
fix: added a camel case check for isEnglishWord and fixed some bugs
TangoBeeAkto Aug 23, 2024
62e6067
fix: loading dictionary binary in TestDBSync
TangoBeeAkto Aug 23, 2024
c635919
feat: added more unit tests and added two letters words in the dictio…
TangoBeeAkto Aug 26, 2024
3764ffd
removed words_alpha.txt from the resource folder
TangoBeeAkto Aug 27, 2024
bba8511
Merge remote-tracking branch 'origin/feature/bloom_filter_for_dict' i…
TangoBeeAkto Aug 30, 2024
c7e85c1
feat: inserting demerged urls in db and stopping them from merging again
TangoBeeAkto Sep 2, 2024
2e1de40
Merge branch 'master' into feature/bloom_filter_for_dict
TangoBeeAkto Sep 20, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;

import com.akto.DaoInit;
import com.akto.dao.*;
import com.akto.dao.context.Context;
import com.akto.dto.*;
Expand All @@ -21,6 +20,7 @@
import com.akto.dto.usage.MetricTypes;
import com.akto.log.LoggerMaker;
import com.akto.log.LoggerMaker.LogDb;
import com.akto.util.filter.DictionaryFilter;
import com.akto.runtime.merge.MergeOnHostOnly;
import com.akto.runtime.policies.AktoPolicyNew;
import com.akto.task.Cluster;
Expand All @@ -35,23 +35,18 @@
import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;
import com.mongodb.BasicDBObject;
import com.mongodb.ConnectionString;
import com.mongodb.bulk.BulkWriteResult;
import com.mongodb.client.model.*;
import com.mongodb.client.result.UpdateResult;
import org.apache.commons.lang3.math.NumberUtils;
import org.bson.Document;
import org.bson.conversions.Bson;
import org.bson.json.JsonParseException;
import org.bson.types.ObjectId;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;

import static com.akto.dto.type.KeyTypes.patternToSubType;

Expand Down Expand Up @@ -717,6 +712,7 @@ public static URLTemplate tryParamteresingUrl(URLStatic newUrl){
int start = newUrl.getUrl().startsWith("http") ? 3 : 0;
for(int i = start; i < tokens.length; i ++) {
String tempToken = tokens[i];
if(DictionaryFilter.isEnglishWord(tempToken)) continue;

if (NumberUtils.isParsable(tempToken)) {
newTypes[i] = isNumber(tempToken) ? SuperType.INTEGER : SuperType.FLOAT;
Expand Down Expand Up @@ -768,6 +764,7 @@ public static URLTemplate tryMergeUrls(URLStatic dbUrl, URLStatic newUrl) {
for(int i = 0; i < newTokens.length; i ++) {
String tempToken = newTokens[i];
String dbToken = dbTokens[i];
if (DictionaryFilter.isEnglishWord(tempToken) && DictionaryFilter.isEnglishWord(dbToken)) continue;
TangoBeeAkto marked this conversation as resolved.
Show resolved Hide resolved

int minCount = dbUrl.getUrl().startsWith("http") && newUrl.getUrl().startsWith("http") ? 3 : 0;
if (tempToken.equalsIgnoreCase(dbToken) || i < minCount) {
Expand Down
4 changes: 3 additions & 1 deletion apps/api-runtime/src/main/java/com/akto/runtime/Main.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import java.util.*;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;

import com.akto.DaoInit;
Expand All @@ -16,6 +15,7 @@
import com.akto.log.LoggerMaker;
import com.akto.log.LoggerMaker.LogDb;
import com.akto.parsers.HttpCallParser;
import com.akto.util.filter.DictionaryFilter;
import com.akto.util.AccountTask;
import com.akto.util.DashboardMode;
import com.google.gson.Gson;
Expand Down Expand Up @@ -157,6 +157,8 @@ public static void main(String[] args) {
}
int maxPollRecordsConfig = Integer.parseInt(System.getenv("AKTO_KAFKA_MAX_POLL_RECORDS_CONFIG"));

DictionaryFilter.readDictionaryBinary();
TangoBeeAkto marked this conversation as resolved.
Show resolved Hide resolved

if (topicName == null) topicName = "akto.api.logs";

DaoInit.init(new ConnectionString(mongoURI));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import com.akto.dto.type.*;
import com.akto.runtime.APICatalogSync;
import com.akto.types.CappedSet;
import com.akto.util.filter.DictionaryFilter;
import com.akto.utils.RedactSampleData;
import com.google.api.client.util.Charsets;
import com.google.common.hash.BloomFilter;
Expand All @@ -25,6 +26,8 @@
import com.mongodb.client.model.Updates;
import org.bson.conversions.Bson;
import org.bson.types.ObjectId;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;

import java.util.*;
Expand All @@ -36,6 +39,14 @@

public class TestMergingNew extends MongoBasedTest {

/**
 * Loads the dictionary Bloom filter before each test.
 * DictionaryFilter.isEnglishWord returns false for every input while the filter is unloaded.
 */
@Before
public void initMain() {
DictionaryFilter.readDictionaryBinary();
}




public void testInitializer(){
Map<String, AktoDataType> aktoDataTypeMap = new HashMap<>();
aktoDataTypeMap.put("JWT", new AktoDataType(null, false, null, 0, new IgnoreData(new HashMap<>(), new HashSet<>()), false, true));
Expand Down Expand Up @@ -95,6 +106,80 @@ public void testMultipleIntegerMerging() {

}

/**
 * Sends four single-segment URLs under {@code /api/} (three plain words and one UUID)
 * through sync, merge and rebuild, then expects exactly one templated URL and three
 * strict URLs in collection 123.
 *
 * NOTE(review): presumably the UUID path is the one that becomes the template while the
 * three word paths stay static; this is inferred from the expected counts only.
 */
@Test
public void testStringMerging() {
    testInitializer();
    SingleTypeInfoDao.instance.getMCollection().drop();
    ApiCollectionsDao.instance.getMCollection().drop();
    HttpCallParser parser = new HttpCallParser("userIdentifier", 1, 1, 1, true);

    String prefix = "/api/";
    String[] lastSegments = {
        "demo",
        "cat",
        "31a1a7c5-b4e3-47f5-8579-f7fc044c6a98",
        "tree"
    };

    // One sample request per URL, all attributed to the same user.
    List<HttpResponseParams> traffic = new ArrayList<>();
    for (String lastSegment : lastSegments) {
        traffic.add(createSampleParams("user1", prefix + lastSegment));
    }

    // Same call sequence as before: ingest, persist, merge, then reload state from the DB.
    parser.syncFunction(traffic, false, true, null);
    parser.apiCatalogSync.syncWithDB(false, true, SyncLimit.noLimit);
    APICatalogSync.mergeUrlsAndSave(123, true, false, parser.apiCatalogSync.existingAPIsInDb);
    parser.apiCatalogSync.buildFromDB(true, true);

    Map<URLTemplate, RequestTemplate> templates =
            parser.apiCatalogSync.getDbState(123).getTemplateURLToMethods();
    Map<URLStatic, RequestTemplate> strictUrls =
            parser.apiCatalogSync.getDbState(123).getStrictURLToMethods();

    assertEquals(1, templates.size());
    assertEquals(3, strictUrls.size());
}

/**
 * Builds {@code /link/<word>} requests for fifty words and feeds the first 33 of them
 * in three batches (23, then 5, then 5). After each batch the static URL count reported
 * by {@code getStaticURLsSize} must equal the number of URLs sent so far (23, 28, 33);
 * a merge pass runs after the second batch.
 */
@Test
public void testEnglishWordsUrlTestString() {
    testInitializer();
    SingleTypeInfoDao.instance.getMCollection().drop();
    ApiCollectionsDao.instance.getMCollection().drop();
    HttpCallParser parser = new HttpCallParser("userIdentifier", 1, 1, 1, true);

    String prefix = "/link/";
    String[] words = {
        "apple", "banana", "cat", "dog", "elephant", "flower", "guitar", "house",
        "island", "jungle", "kite", "lemon", "mountain", "night", "ocean", "piano",
        "queen", "river", "sun", "tree", "umbrella", "village", "whale", "xylophone",
        "yacht", "zebra", "bird", "clock", "desert", "engine", "forest", "garden",
        "honey", "igloo", "jacket", "kangaroo", "lamp", "mirror", "notebook", "orange",
        "pencil", "quilt", "rain", "star", "telephone", "uniform", "violin", "window",
        "yellow", "zipper"
    };

    // A sample request is created for every word, although only the first 33 are sent.
    List<HttpResponseParams> traffic = new ArrayList<>();
    for (String word : words) {
        traffic.add(createSampleParams("user1", prefix + word));
    }

    // Batch 1: 23 URLs, persisted without a merge pass.
    parser.syncFunction(traffic.subList(0, 23), false, true, null);
    parser.apiCatalogSync.syncWithDB(false, true, SyncLimit.noLimit);
    assertEquals(23, getStaticURLsSize(parser));

    // Batch 2: 5 more URLs, followed by merge and reload from the DB.
    parser.syncFunction(traffic.subList(23, 28), false, true, null);
    parser.apiCatalogSync.syncWithDB(false, true, SyncLimit.noLimit);
    APICatalogSync.mergeUrlsAndSave(123, true, false, parser.apiCatalogSync.existingAPIsInDb);
    parser.apiCatalogSync.buildFromDB(false, true);
    assertEquals(28, getStaticURLsSize(parser));

    // Batch 3: 5 more URLs, persisted without a merge pass.
    parser.syncFunction(traffic.subList(28, 33), false, true, null);
    parser.apiCatalogSync.syncWithDB(false, true, SyncLimit.noLimit);
    assertEquals(33, getStaticURLsSize(parser));
}


public int getStaticURLsSize(HttpCallParser parser) {
Map<URLStatic, RequestTemplate> urlStaticMap = parser.apiCatalogSync.getDbState(123).getStrictURLToMethods();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@
import com.akto.testing.HostDNSLookup;
import com.akto.usage.UsageMetricHandler;
import com.akto.testing.workflow_node_executor.Utils;
import com.akto.util.filter.DictionaryFilter;
import com.akto.utils.jobs.JobUtils;
import com.akto.util.AccountTask;
import com.akto.util.ConnectionInfo;
Expand Down Expand Up @@ -1888,6 +1889,7 @@ public static boolean isNotKubernetes() {
@Override
public void contextInitialized(javax.servlet.ServletContextEvent sce) {
setSubdomain();
DictionaryFilter.readDictionaryBinary();

String https = System.getenv("AKTO_HTTPS_FLAG");
if (Objects.equals(https, "true")) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,9 @@
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;

import com.akto.DaoInit;
import com.akto.dao.*;
import com.akto.dao.context.Context;
import com.akto.dto.*;
import com.akto.dto.HttpResponseParams.Source;
import com.akto.dto.bulk_updates.BulkUpdates;
import com.akto.dto.bulk_updates.UpdatePayload;
import com.akto.dto.traffic.Key;
Expand All @@ -19,36 +17,26 @@
import com.akto.dto.type.SingleTypeInfo.SubType;
import com.akto.dto.type.SingleTypeInfo.SuperType;
import com.akto.dto.type.URLMethods.Method;
import com.akto.hybrid_parsers.HttpCallParser;
import com.akto.log.LoggerMaker;
import com.akto.log.LoggerMaker.LogDb;
import com.akto.data_actor.DataActor;
import com.akto.data_actor.DataActorFactory;
import com.akto.hybrid_runtime.merge.MergeOnHostOnly;
import com.akto.hybrid_runtime.policies.AktoPolicyNew;
import com.akto.task.Cluster;
import com.akto.util.filter.DictionaryFilter;
import com.akto.types.CappedSet;
import com.akto.util.JSONUtils;
import com.akto.utils.RedactSampleData;
import com.alibaba.fastjson2.JSON;
import com.alibaba.fastjson2.JSONObject;
import com.mongodb.BasicDBObject;
import com.mongodb.ConnectionString;
import com.mongodb.bulk.BulkWriteResult;
import com.mongodb.client.model.*;
import com.mongodb.client.result.UpdateResult;
import org.apache.commons.lang3.math.NumberUtils;
import org.bson.Document;
import org.bson.conversions.Bson;
import org.bson.json.JsonParseException;
import org.bson.types.ObjectId;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;

import static com.akto.dto.type.KeyTypes.patternToSubType;

public class APICatalogSync {
Expand Down Expand Up @@ -325,6 +313,8 @@ private void tryMergingWithKnownStrictURLs(
while (iterator.hasNext()) {
Map.Entry<URLStatic, RequestTemplate> entry = iterator.next();
URLStatic newUrl = entry.getKey();
if(DictionaryFilter.isEnglishWord(newUrl.getUrl())) continue;
TangoBeeAkto marked this conversation as resolved.
Show resolved Hide resolved

RequestTemplate newTemplate = entry.getValue();
String[] tokens = tokenize(newUrl.getUrl());

Expand Down Expand Up @@ -636,6 +626,8 @@ private void tryWithKnownURLTemplates(
while (iterator.hasNext()) {
Map.Entry<URLStatic, RequestTemplate> entry = iterator.next();
URLStatic newUrl = entry.getKey();
if(DictionaryFilter.isEnglishWord(newUrl.getUrl())) continue;
TangoBeeAkto marked this conversation as resolved.
Show resolved Hide resolved

RequestTemplate newRequestTemplate = entry.getValue();

for (URLTemplate urlTemplate: dbCatalog.getTemplateURLToMethods().keySet()) {
Expand Down
3 changes: 2 additions & 1 deletion libs/dao/src/main/java/com/akto/dto/type/URLTemplate.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import com.akto.dto.type.SingleTypeInfo.SuperType;
import com.akto.dto.type.URLMethods.Method;

import com.akto.util.filter.DictionaryFilter;
import org.bson.codecs.pojo.annotations.BsonDiscriminator;
import org.bson.codecs.pojo.annotations.BsonId;
import org.apache.commons.lang3.math.NumberUtils;
Expand Down Expand Up @@ -70,7 +71,7 @@ public boolean match(String[] url, Method urlMethod) {

if (thisToken == null) {
SuperType type = types[i];

if (DictionaryFilter.isEnglishWord(thatToken)) return false;
switch(type) {
case BOOLEAN:
if (!"true".equals(thatToken.toLowerCase()) && !"false".equals(thatToken.toLowerCase())) return false;
Expand Down
36 changes: 36 additions & 0 deletions libs/dao/src/main/java/com/akto/util/filter/DictionaryFilter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
package com.akto.util.filter;

import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.regex.Pattern;

/**
 * Bloom-filter backed check for whether a token consists only of dictionary words.
 *
 * <p>The filter is deserialized from the classpath resource {@code /DictionaryBinary} by
 * {@link #readDictionaryBinary()}. Until that call succeeds, {@link #isEnglishWord(String)}
 * answers {@code false} for every input. Because the backing structure is a Bloom filter,
 * a {@code true} answer means "probably a word": false positives are possible, false
 * negatives are not.
 */
public class DictionaryFilter {
    private static final Logger logger = LoggerFactory.getLogger(DictionaryFilter.class);

    /** Classpath location of the serialized Bloom filter. */
    private static final String DICTIONARY_RESOURCE = "/DictionaryBinary";

    /** Characters that split a token into separately checked segments; compiled once. */
    private static final Pattern SEGMENT_SEPARATOR = Pattern.compile("[-_.]");

    /** Loaded dictionary filter; stays {@code null} until a load succeeds. */
    public static BloomFilter<CharSequence> dictFilter = null;

    /**
     * Loads the Bloom filter from the classpath into {@link #dictFilter}.
     *
     * <p>This is best-effort: any failure is logged and the previous value of
     * {@link #dictFilter} is kept, so callers never see an exception from here.
     */
    public static void readDictionaryBinary() {
        try (InputStream binary = DictionaryFilter.class.getResourceAsStream(DICTIONARY_RESOURCE)) {
            if (binary == null) {
                // getResourceAsStream returns null for a missing resource; report it explicitly
                // instead of letting the deserializer fail with an opaque NullPointerException.
                logger.error("Error while reading bloom filter binary: resource "
                        + DICTIONARY_RESOURCE + " not found on classpath");
                return;
            }
            logger.info("reading dictionary binary");
            dictFilter = BloomFilter.readFrom(binary, Funnels.stringFunnel(StandardCharsets.UTF_8));
        } catch (Exception e) {
            // Deliberately broad: a corrupt or unreadable binary must not abort startup.
            logger.error("Error while reading bloom filter binary: " + e.getMessage(), e);
        }
    }

    /**
     * Tells whether every {@code -}, {@code _} or {@code .} separated segment of
     * {@code word} is (probably) present in the dictionary filter.
     *
     * @param word token to check; may be {@code null}
     * @return {@code false} when the filter is not loaded, when {@code word} is
     *         {@code null}, blank, or made of separators only, or when any non-empty
     *         segment is absent from the filter; {@code true} otherwise
     */
    public static boolean isEnglishWord(String word) {
        BloomFilter<CharSequence> filter = dictFilter;
        if (filter == null || word == null || word.trim().isEmpty()) return false;

        boolean sawSegment = false;
        for (String seg : SEGMENT_SEPARATOR.split(word)) {
            if (seg.isEmpty()) continue;
            // Locale.ROOT keeps the lookup key stable regardless of the JVM default locale
            // (e.g. Turkish dotted/dotless i). Segments are upper-cased before lookup, so
            // presumably the filter was built from upper-case words. TODO confirm
            if (!filter.mightContain(seg.toUpperCase(Locale.ROOT))) return false;
            sawSegment = true;
        }

        // Inputs such as "-" or "__" produce no segments and must not count as words.
        return sawSegment;
    }

}
Binary file added libs/dao/src/main/resources/DictionaryBinary
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
package com.akto.utils.filter;

import com.akto.util.filter.DictionaryFilter;
import org.junit.Before;
import org.junit.Test;

import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

public class TestDictionaryFilter {
@Before
public void initMain() {
DictionaryFilter.readDictionaryBinary();
}


@Test
public void testValidEnglishWord() {
assertTrue(DictionaryFilter.isEnglishWord("demo"));
assertTrue(DictionaryFilter.isEnglishWord("cat"));
assertTrue(DictionaryFilter.isEnglishWord("example"));
}

@Test
public void testInvalidEnglishWord() {
assertFalse(DictionaryFilter.isEnglishWord("xyzabc"));
assertFalse(DictionaryFilter.isEnglishWord("nonexistentword"));
}

@Test
public void testHyphenatedWords() {
assertTrue(DictionaryFilter.isEnglishWord("well-known"));
assertFalse(DictionaryFilter.isEnglishWord("well-known-xyzabc"));
}

@Test
public void testUnderscoreSeparatedWords() {
assertTrue(DictionaryFilter.isEnglishWord("black_white"));
assertTrue(DictionaryFilter.isEnglishWord("red_blue"));
assertFalse(DictionaryFilter.isEnglishWord("red_blue_xyzabc"));
}

@Test
public void testDotSeparatedWords() {
assertTrue(DictionaryFilter.isEnglishWord("hello.world"));
assertTrue(DictionaryFilter.isEnglishWord("good.bye"));
assertFalse(DictionaryFilter.isEnglishWord("hello.world.xyzabc"));
}

@Test
public void testEmptyString() {
assertFalse(DictionaryFilter.isEnglishWord(""));
}

@Test
public void testMixedCaseWords() {
assertTrue(DictionaryFilter.isEnglishWord("Demo"));
assertTrue(DictionaryFilter.isEnglishWord("CaT"));
assertFalse(DictionaryFilter.isEnglishWord("NotExist"));
TangoBeeAkto marked this conversation as resolved.
Show resolved Hide resolved
}

}
Loading