first commit

nianxy · Sep 30, 2019 · 53bf97d · 53bf97d
1 parent c851fa4
commit 53bf97d
Show file tree

Hide file tree

Showing 8 changed files with 2,837 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,9 @@
+/target
+/build
+/out
+/.idea
+*.log
+
+*.iml
+*.ipr
+*.iws
diff --git a/README.md b/README.md
@@ -0,0 +1,59 @@
+<div style="text-align: center">simplesifter</div>
+
+### 项目功能
+过滤掉文本中的指定词汇列表，可用于敏感词过滤
+
+### 原理
+使用DFA状态树，最好的情况O(n)，最坏情况O(n·m)（n为文本长度，m为最长过滤词的长度）
+```properties
+# 最好情况举例：
+# 过滤词: 你好
+# 文本源: 北京欢迎你
+
+# 最坏情况举例：
+# 过滤词: aaaab
+# 文本源: aaaaa
+```
+
+### 参考
+[《敏感词过滤的算法原理之DFA算法》](https://blog.csdn.net/cdj0311/article/details/79789480)
+
+### 对于参考算法的改进
+上述参考文章算法并不完善，因为无法处理以下情况：
+```properties
+# 过滤词： 12345 235
+# 文本源： 1235
+```
+参考算法对以上文本源的处理是无任何命中，但很显然，235应该被命中。
+解决办法有两个：
+1. 当匹配分支失败后，对文本源的处理应该回溯到首个匹配字符的下一个位置，继续尝试匹配，而不是从当前位置继续尝试匹配
+2. 遍历文本源过程中，对每个出现的字符都在DFA中进行首字匹配，而不论当前是否有匹配的分支；如果当前有多个匹配分支，则需要同时处理多个分支
+
+`simple-sifter`使用了第2种解决办法（其实，第1种办法应该更简单一些。可是我为什么要用第2种呢？我也想不明白了:cry:）
+
+
+### 使用方法
+```java
+import com.nianxy.simplesifter.WordSifter;
+
+// ...
+
+// 创建一个WordSifter对象
+WordSifter wordSifter = new WordSifter();
+wordSifter.loadWords(words);
+
+// 打印DFA结点，可用于调试
+wordSifter.printRoot();
+
+// 创建WordSifter.Filter对象，用于过滤文本
+WordSifter.Filter filter = wordSifter.createFilter();
+
+// 可选，设置用于替换命中词的字符串，如果不设置默认为"**"
+filter.setReplaceStr("###");
+
+// 过滤文本
+String filtered = filter.filter("some text");
+
+// filter对象可以重复使用，每次都可以通过setReplaceStr()设置不同的替换字符串
+// 但它并不是线程安全的，如果需要在多个线程中使用，请在每个线程内创建单独的filter对象
+```
diff --git a/pom.xml b/pom.xml
@@ -0,0 +1,39 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <groupId>com.nianxy</groupId>
+  <artifactId>simple-sifter</artifactId>
+  <version>1.0-SNAPSHOT</version>
+  <packaging>jar</packaging>
+
+  <properties>
+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+  </properties>
+
+  <dependencies>
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <version>4.11</version>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-compiler-plugin</artifactId>
+        <configuration>
+          <source>1.6</source>
+          <target>1.6</target>
+        </configuration>
+      </plugin>
+    </plugins>
+  </build>
+
+
+</project>
diff --git a/src/main/java/com/nianxy/simplesifter/StateNode.java b/src/main/java/com/nianxy/simplesifter/StateNode.java
@@ -0,0 +1,76 @@
+package com.nianxy.simplesifter;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * One node of the character state tree.
+ *
+ * <p>Each node stores the character it was registered under, its distance
+ * from the root (the root has depth 0), an "end" flag, and its children keyed
+ * by character. The end flag presumably marks the last character of a
+ * complete filter word; that meaning is assigned by the caller, not here.
+ *
+ * <p>Instances are not synchronized.
+ */
+class StateNode {
+    private final char ch;
+    private final byte depth;
+    private boolean end;
+    private final Map<Character,StateNode> children = new HashMap<Character,StateNode>();
+
+    /**
+     * Creates a root node: depth 0, end flag cleared, no children.
+     * The root's character is the default {@code '\0'}.
+     */
+    public StateNode() {
+        this('\0', (byte) 0);
+    }
+
+    /** Creates a node for {@code ch} located {@code depth} levels below the root. */
+    private StateNode(char ch, byte depth) {
+        this.ch = ch;
+        this.depth = depth;
+        this.end = false;
+    }
+
+    /** @return the character this node was registered under */
+    public char getCh() {
+        return ch;
+    }
+
+    /** @return the number of edges between the root and this node */
+    public byte getDepth() {
+        return depth;
+    }
+
+    /** @return the current value of the end flag */
+    public boolean isEnd() {
+        return end;
+    }
+
+    /** @param end the new value of the end flag */
+    public void setEnd(boolean end) {
+        this.end = end;
+    }
+
+    /**
+     * Looks up the direct child registered under {@code ch}.
+     *
+     * @param ch the character to look up
+     * @return the child node, or {@code null} if there is none
+     */
+    public StateNode getChild(char ch) {
+        return children.get(ch);
+    }
+
+    /**
+     * Creates a new child for {@code ch} one level deeper than this node and
+     * registers it, replacing any child previously registered under the same
+     * character.
+     *
+     * @param ch the character of the new child
+     * @return the newly created child node
+     * @throws IllegalStateException if this node is already at depth
+     *         {@link Byte#MAX_VALUE}; the depth is stored in a {@code byte}
+     *         and would otherwise silently wrap around to a negative value
+     */
+    public StateNode addChild(char ch) {
+        if (depth == Byte.MAX_VALUE) {
+            throw new IllegalStateException(
+                    "state tree is too deep: node depth cannot exceed " + Byte.MAX_VALUE);
+        }
+        StateNode node = new StateNode(ch, (byte)(depth + 1));
+        children.put(ch, node);
+        return node;
+    }
+
+    /**
+     * Prints {@code node} and, recursively, its whole subtree to standard output.
+     *
+     * @param prefix           text emitted before this node's "+ " marker
+     * @param needLinkNeighbor whether the indentation added for this node's
+     *                         children is drawn with a vertical bar ("| ")
+     *                         instead of blanks
+     * @param node             the node to print
+     */
+    private static void print(String prefix, boolean needLinkNeighbor, StateNode node) {
+        StringBuilder line = new StringBuilder();
+        line.append(prefix).append("+ ").append(node.ch);
+        if (node.isEnd()) {
+            line.append("(E)");
+        }
+        System.out.println(line);
+
+        String childPrefix = prefix + (needLinkNeighbor ? "| " : "  ");
+        // Every child except the last one visited is printed with the bar flag set.
+        int remaining = node.children.size();
+        for (StateNode child : node.children.values()) {
+            --remaining;
+            print(childPrefix, remaining > 0, child);
+        }
+    }
+
+    /**
+     * Prints the state tree rooted at this node to standard output; useful for debugging.
+     */
+    public void print() {
+        print("", !children.isEmpty(), this);
+    }
+}