Skip to content

Commit

Permalink
Make regular expression to extract URLs from CSS more restrictive:
Browse files Browse the repository at this point in the history
merged improvements from iipc#63
  • Loading branch information
sebastian-nagel committed Jan 18, 2017
1 parent da92adb commit eb66fc4
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 37 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,18 @@ public class ExtractingParseObserver implements ParseObserver {

protected static String cssUrlPatString =
"url\\s*\\(\\s*((?:\\\\?[\"'])?.+?(?:\\\\?[\"'])?)\\s*\\)";
protected static String cssUrlTrimPatString =
"^(?:\\\\?[\"'])+|(?:\\\\?[\"'])+$";
protected static String cssImportNoUrlPatString =
"@import\\s+((?:'[^']+')|(?:\"[^\"]+\")|(?:\\('[^']+'\\))|(?:\\(\"[^\"]+\"\\))|(?:\\([^)]+\\))|(?:[a-z0-9_.:/\\\\-]+))\\s*;";
"@import\\s+((?:'[^']+')|(?:\"[^\"]+\")|(?:\\('[^']+'\\))|(?:\\(\"[^\"]+\"\\))|(?:\\([^)]+\\))|(?:[a-z0-9_.:/\\\\-]+))\\s*;";

protected static Pattern cssImportNoUrlPattern = Pattern
.compile(cssImportNoUrlPatString);

protected static Pattern cssUrlPattern = Pattern.compile(cssUrlPatString);

protected static Pattern cssUrlTrimPattern = Pattern.compile(cssUrlTrimPatString);

private final static int MAX_TEXT_LEN = 100;

// private static String GLOBAL_ATTR[] = {"background"};
Expand Down Expand Up @@ -417,22 +422,10 @@ private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String conten
while((idx < contentLen) && m.find()) {
idx = m.end();
String url = m.group(1);
if(url.length() < 2) {
continue;
}
if ((url.charAt(0) == '(')
&& (url.charAt(url.length()-1) == ')')) {
url = url.substring(1, url.length() - 1);
}
if (url.charAt(0) == '"' || url.charAt(0) == '\'') {
url = url.substring(1, url.length() - 1);
} else if (url.charAt(0) == '\\') {
if(url.length() <= 4) {
continue;
}
url = url.substring(2, url.length() - 2);
url = cssUrlTrimPattern.matcher(url).replaceAll("");
if (!url.isEmpty()) {
data.addHref("path","STYLE/#text","href", url);
}
data.addHref("path","STYLE/#text","href",url);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@ public void testHandleStyleNodeExceptions() throws Exception {
"url (' ')",
"url('\")",
"url(')",
"url('\"')"
"url('\"')",
"url('\\\"\"')",
"url(''''')"
};
boolean except = false;
HTMLMetaData md = new HTMLMetaData(new MetaData());
Expand All @@ -37,6 +39,7 @@ public void testHandleStyleNodeExceptions() throws Exception {
assertFalse(except);
}
}

public void testHandleStyleNode() throws Exception {
String[][] tests = {
{""},
Expand All @@ -45,31 +48,36 @@ public void testHandleStyleNode() throws Exception {
{"url(\"foo.gif\")","foo.gif"},
{"url(\\\"foo.gif\\\")","foo.gif"},
{"url(\\'foo.gif\\')","foo.gif"},

};
{"url(''foo.gif'')","foo.gif"},
{"url( foo.gif )","foo.gif"},
{"url('''')"},
{"url('foo.gif'')","foo.gif"},
};
for(String[] testa : tests) {
checkExtract(testa);
}
// boolean except = false;
// HTMLMetaData md = new HTMLMetaData(new MetaData());
// ExtractingParseObserver epo = new ExtractingParseObserver(md);
// for(String css : tests) {
// try {
// TextNode tn = new TextNode(css);
// epo.handleStyleNode(tn);
// } catch(Exception e) {
// System.err.format("And the winner is....(%s)\n", css);
// e.printStackTrace();
// except = true;
// throw e;
// }
// assertFalse(except);
// }
}

/**
 * Verifies that the CSS link pattern matcher extracts nothing, and does not
 * hang, when an overlong CSS link is truncated: the input is an unterminated
 * <code>url(</code> followed by a very long run of single quotes, the text
 * <code>foo.gif</code>, and another very long run of single quotes.
 */
public void testHandleStyleNodeNoHangupTruncated() throws Exception {
    final String prefix = "url(";
    final String link = "foo.gif";
    final int leadingQuotes = 500000;
    final int trailingQuotes = 499000;

    // Presize the buffer; the generated input is roughly one million chars.
    StringBuilder css = new StringBuilder(
            prefix.length() + leadingQuotes + link.length() + trailingQuotes);
    css.append(prefix);
    for (int remaining = leadingQuotes; remaining > 0; remaining--) {
        css.append('\'');
    }
    css.append(link);
    for (int remaining = trailingQuotes; remaining > 0; remaining--) {
        css.append('\'');
    }

    // Single-element array: only the CSS input, no expected links follow it.
    checkExtract(new String[] { css.toString() });
}

private void checkExtract(String[] data) throws JSONException {
// System.err.format("CSS(%s) want[0](%s)\n",css,want[0]);
String css = data[0];
boolean except = false;
HTMLMetaData md = new HTMLMetaData(new MetaData());
ExtractingParseObserver epo = new ExtractingParseObserver(md);
try {
Expand All @@ -87,10 +95,11 @@ private void checkExtract(String[] data) throws JSONException {

assertTrue(o instanceof JSONObject);
JSONObject jo = (JSONObject) o;
assertEquals(data[i],jo.getString("href"));
assertEquals("CSS link extraction failed for <" + css + ">",
data[i], jo.getString("href"));
}
} else {
assertNull(a);
assertNull("Expected no extracted link for <" + css + ">", a);
}
}

Expand Down

0 comments on commit eb66fc4

Please sign in to comment.