-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
2,837 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
/target | ||
/build | ||
/out | ||
/.idea | ||
*.log | ||
|
||
*.iml | ||
*.ipr | ||
*.iws |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
<div style="text-align: center">simplesifter</div> | ||
|
||
### 项目功能 | ||
过滤掉文本中的指定词汇列表,可用于敏感词过滤 | ||
|
||
### 原理 | ||
使用DFA状态树,最好的情况O(n),最坏情况O(n·m)(n为文本源长度,m为最长过滤词的长度) | ||
```properties | ||
# 最好情况举例: | ||
# 过滤词: 你好 | ||
# 文本源: 北京欢迎你 | ||
|
||
# 最坏情况举例: | ||
# 过滤词: aaaab | ||
# 文本源: aaaaa | ||
``` | ||
|
||
### 参考 | ||
[《敏感词过滤的算法原理之DFA算法》](https://blog.csdn.net/cdj0311/article/details/79789480) | ||
|
||
### 对于参考算法的改进 | ||
上述参考文章算法并不完善,因为无法处理以下情况: | ||
```properties | ||
# 过滤词: 12345 235 | ||
# 文本源: 1235 | ||
``` | ||
参考算法对以上文本源的处理是无任何命中,但很显然,235应该被命中。 | ||
解决办法有两个: | ||
1. 当匹配分支失败后,对文本源的处理应该回溯到首个匹配字符的下一个位置,继续尝试匹配,而不是从当前位置继续尝试匹配 | ||
2. 遍历文本源过程中,对每个出现的字符都在DFA中进行首字匹配,而不论当前是否有匹配的分支;如果当前有多个匹配分支,则需要同时处理多个分支 | ||
|
||
`simple-sifter`使用了第2种解决办法(其实,第1种办法应该更简单一些。可是我为什么要用第2种呢?我也想不明白了:cry:) | ||
|
||
|
||
### 使用方法 | ||
```java | ||
import com.nianxy.simplesifter.WordSifter; | ||
|
||
// ... | ||
|
||
// 创建一个WordSifter对象 | ||
WordSifter wordSifter = new WordSifter(); | ||
wordSifter.loadWords(words); | ||
|
||
// 打印DFA结点,可用于调试 | ||
wordSifter.printRoot(); | ||
|
||
// 创建WordSifter.Filter对象,用于过滤文本 | ||
WordSifter.Filter filter = wordSifter.createFilter(); | ||
|
||
// 可选,设置用于替换命中词的字符串,如果不设置默认为"**" | ||
filter.setReplaceStr("###"); | ||
|
||
// 过滤文本 | ||
String filtered = filter.filter("some text"); | ||
|
||
// filter对象可以重复使用,每次都可以通过setReplaceStr()设置不同的替换字符串 | ||
// 但它并不是线程安全的,如果需要在多个线程中使用,请在每个线程内创建单独的filter对象 | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<project xmlns="http://maven.apache.org/POM/4.0.0" | ||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
<modelVersion>4.0.0</modelVersion> | ||
|
||
<groupId>com.nianxy</groupId> | ||
<artifactId>simple-sifter</artifactId> | ||
<version>1.0-SNAPSHOT</version> | ||
<packaging>jar</packaging> | ||
|
||
<properties> | ||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> | ||
</properties> | ||
|
||
<dependencies> | ||
<dependency> | ||
<groupId>junit</groupId> | ||
<artifactId>junit</artifactId> | ||
<version>4.11</version> | ||
<scope>test</scope> | ||
</dependency> | ||
</dependencies> | ||
|
||
<build> | ||
<plugins> | ||
<plugin> | ||
<groupId>org.apache.maven.plugins</groupId> | ||
<artifactId>maven-compiler-plugin</artifactId> | ||
<configuration> | ||
<source>1.6</source> | ||
<target>1.6</target> | ||
</configuration> | ||
</plugin> | ||
</plugins> | ||
</build> | ||
|
||
|
||
</project> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
package com.nianxy.simplesifter; | ||
|
||
import java.util.HashMap; | ||
import java.util.Map; | ||
|
||
class StateNode { | ||
private char ch; | ||
private byte depth; | ||
private boolean end; | ||
private Map<Character,StateNode> children; | ||
|
||
public StateNode() { | ||
depth = 0; | ||
end = false; | ||
children = new HashMap<Character,StateNode>(); | ||
} | ||
|
||
public char getCh() { | ||
return ch; | ||
} | ||
|
||
public byte getDepth() { | ||
return depth; | ||
} | ||
|
||
public boolean isEnd() { | ||
return end; | ||
} | ||
|
||
public void setEnd(boolean end) { | ||
this.end = end; | ||
} | ||
|
||
public StateNode getChild(char ch) { | ||
return children.get(ch); | ||
} | ||
|
||
public StateNode addChild(char ch) { | ||
StateNode node = new StateNode(); | ||
node.ch = ch; | ||
node.depth = (byte)(depth + 1); | ||
node.end = false; | ||
children.put(ch, node); | ||
return node; | ||
} | ||
|
||
private void print(String prefix, boolean needLinkNeighbor, StateNode node) { | ||
StringBuilder line = new StringBuilder(); | ||
line.append(prefix); | ||
line.append("+ "); | ||
line.append(node.ch); | ||
if (node.isEnd()) { | ||
line.append("(E)"); | ||
} | ||
System.out.println(line); | ||
|
||
if (needLinkNeighbor) { | ||
prefix += "| "; | ||
} else { | ||
prefix += " "; | ||
} | ||
int i = 0; | ||
for (StateNode child:node.children.values()) { | ||
boolean neighbor = i<node.children.size()-1; | ||
print(prefix, neighbor, child); | ||
++i; | ||
} | ||
} | ||
|
||
/** | ||
* 打印状态树结点,可用于调试 | ||
*/ | ||
public void print() { | ||
print("", children.size()>0, this); | ||
} | ||
} |
Oops, something went wrong.