Skip to content

Commit c2cd7b4

Browse files
Rework using GenOptions to allow more control over "." matches
1 parent 4eb418c commit c2cd7b4

File tree

14 files changed

+513
-206
lines changed

14 files changed

+513
-206
lines changed

README.md

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
* [How Does It Work?](#how-does-it-work)
88
* [The basics](#the-basics)
99
* [Exact matches vs. substring matches](#exact-matches-vs-substring-matches)
10+
* [What matches "dot"?](#what-matches-dot)
1011
* [How to create longer matching strings](#how-to-create-longer-matching-strings)
1112
* [How to generate random matches repeatably](#how-to-generate-random-matches-repeatably)
1213
* [How to limit match length to a specific range](#how-to-limit-match-length-to-a-specific-range)
@@ -84,7 +85,7 @@ l^Ȩú(¹C³TÂI6ÓQaª}f*Allô, world!p±!éÇ'>aDÙ
8485
```
8586

8687
These strings contain not only a substring that matches the regular expression, but also some
87-
extraneous characters before and/or after the match. If the consumer of these strings recognizes a
88+
[extraneous characters](#what-matches-dot) before and/or after the match. If the consumer of these strings recognizes a
8889
general "substring match", that's exactly what you want to give it.
8990

9091
But what if you need to generate only "exact" matches? In other words, strings containing only the matching characters.
@@ -108,6 +109,47 @@ Allô, world!
108109
Howdy, world!
109110
```
110111

112+
### What matches "dot"? ###
113+
114+
What matches the regular expression `.`? In general, any printable character. For a `RegExpGen`, by default, that means "any printable character
115+
from the Latin-1 basic and supplemental Unicode blocks". But what if that includes characters that your application is not designed to handle?
116+
You can define exactly what "any printable character" means using the `GenOptions` for a `RegExpGen`.
117+
118+
For example, you could make a ridiculously narrow definition like this:
119+
120+
```java
121+
...
122+
// Given a JavaScript regular expression...
123+
String regexp = regexp( "<< My secret is [^\\d\\s]{8,32} >>");
124+
125+
// ...and a random number generator...
126+
RandomGen random = getRandomGen();
127+
128+
// ...create a RegExpGen instance...
129+
RegExpGen generator = Parser.parseRegExp( regexp);
130+
131+
// ...matching "." with specific characters...
132+
generator.getOptions().setAnyPrintableChars( "1001 Anagrams!");
133+
...
134+
```
135+
136+
Run this example and the result will be something like this:
137+
138+
```
139+
Aag<< My secret is sgnggag!amm! >>s!Ag
140+
s11aa0Agra<< My secret is rsm!!nA!!Aanmnsgmmr >>m
141+
0m1ra! !n1gr 1<< My secret is mangAmr!!!m >>
142+
```
143+
144+
Notice how the implicit `.*` expressions that generate the beginning and ending of [substring
145+
matches](#exact-matches-vs-substring-matches) draw only from the characters specified by
146+
`GenOptions.setAnyPrintableChars()`. And notice something else: the definition of "any printable"
147+
also affects how matches are generated for exclusionary character classes of the form `[^...]`.
148+
That's because these classes are interpreted to mean "any printable char *except* for...".
149+
150+
For convenience, `GenOptions` defines some common candidates for "any printable", such as
151+
`GenOptions.ANY_LATIN_1` and `GenOptions.ANY_ASCII`.
152+
111153
### How to create longer matching strings ###
112154

113155
For any regular expression, there is always a minimum length for any matching string. Sometimes
@@ -231,6 +273,7 @@ matches will always lie between the given limits. Instead, `RegExpGen` makes a s
231273
* **I'm getting strings with lots of crazy extra characters. What's the deal?**
232274

233275
You should read about the difference between ["exact" matches and "substring" matches](#exact-matches-vs-substring-matches).
276+
And if that's too crazy for you, try changing the definition of [what matches "dot"](#what-matches-dot).
234277

235278
* **Where's the Javadoc?**
236279

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
//////////////////////////////////////////////////////////////////////////////
2+
//
3+
// Copyright 2020, Cornutum Project
4+
// www.cornutum.org
5+
//
6+
//////////////////////////////////////////////////////////////////////////////
7+
8+
package org.cornutum.regexpgen;
9+
10+
import java.util.Arrays;
11+
import java.util.Collections;
12+
import java.util.List;
13+
import java.util.Set;
14+
import java.util.stream.IntStream;
15+
16+
import org.cornutum.regexpgen.util.ToString;
17+
18+
import static java.util.stream.Collectors.toSet;
19+
20+
/**
21+
* Defines options for generating regular expression matches.
22+
*/
23+
public class GenOptions
24+
{
25+
/**
26+
* Creates a new GenOptions instance.
27+
*/
28+
public GenOptions()
29+
{
30+
setAnyPrintableChars( ANY_LATIN_1);
31+
}
32+
33+
/**
34+
* Changes the set of characters used to generate matches for the "." expression.
35+
*/
36+
public void setAnyPrintableChars( Set<Character> chars)
37+
{
38+
anyPrintable_ = chars;
39+
}
40+
41+
/**
42+
* Changes the set of characters used to generate matches for the "." expression.
43+
*/
44+
public void setAnyPrintableChars( String chars)
45+
{
46+
setAnyPrintableChars(
47+
IntStream.range( 0, chars.length())
48+
.mapToObj( i -> chars.charAt( i))
49+
.collect( toSet()));
50+
}
51+
52+
/**
53+
* Returns the set of characters used to generate matches for the "." expression.
54+
*/
55+
public Set<Character> getAnyPrintableChars()
56+
{
57+
return anyPrintable_;
58+
}
59+
60+
public String toString()
61+
{
62+
return
63+
ToString.getBuilder( this)
64+
.toString();
65+
}
66+
67+
private Set<Character> anyPrintable_;
68+
69+
/**
70+
* Return true if the character with the given code point is printable.
71+
*/
72+
private static boolean isPrintable( int codePoint)
73+
{
74+
return
75+
Character.toChars( codePoint)[0] == ' '
76+
|| !(Character.isSpaceChar( codePoint) || notVisible_.contains( Character.getType( codePoint))) ;
77+
}
78+
79+
private static final List<Integer> notVisible_ =
80+
Arrays.asList(
81+
(int) Character.CONTROL,
82+
(int) Character.SURROGATE,
83+
(int) Character.UNASSIGNED);
84+
85+
private static Set<Character> printableChars( int startPoint, int endPoint)
86+
{
87+
return
88+
IntStream.range( startPoint, endPoint)
89+
.filter( GenOptions::isPrintable)
90+
.mapToObj( i -> Character.valueOf( (char) i))
91+
.collect( toSet());
92+
}
93+
94+
/**
95+
* All printable characters in the basic and supplemental Latin-1 code blocks
96+
*/
97+
public static final Set<Character> ANY_LATIN_1 = Collections.unmodifiableSet( printableChars( 0, 256));
98+
99+
/**
100+
* All printable characters in the ASCII code block
101+
*/
102+
public static final Set<Character> ANY_ASCII = Collections.unmodifiableSet( printableChars( 0, 128));
103+
}

src/main/java/org/cornutum/regexpgen/RegExpGen.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,11 @@ public interface RegExpGen
2222
*/
2323
public int getMaxLength();
2424

25+
/**
26+
* Returns the {@link GenOptions options} for this generator.
27+
*/
28+
public GenOptions getOptions();
29+
2530
/**
2631
* Returns a random string within the given bounds that matches this regular expression.
2732
*/

src/main/java/org/cornutum/regexpgen/js/AbstractRegExpGen.java

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
package org.cornutum.regexpgen.js;
99

1010
import org.cornutum.regexpgen.Bounds;
11+
import org.cornutum.regexpgen.GenOptions;
1112
import org.cornutum.regexpgen.RandomGen;
1213
import org.cornutum.regexpgen.RegExpGen;
1314
import org.cornutum.regexpgen.util.ToString;
@@ -22,24 +23,25 @@ public abstract class AbstractRegExpGen implements RegExpGen
2223
/**
2324
* Creates a new AbstractRegExpGen instance.
2425
*/
25-
protected AbstractRegExpGen()
26+
protected AbstractRegExpGen( GenOptions options)
2627
{
27-
this( 1);
28+
this( options, 1);
2829
}
2930

3031
/**
3132
* Creates a new AbstractRegExpGen instance.
3233
*/
33-
protected AbstractRegExpGen( int length)
34+
protected AbstractRegExpGen( GenOptions options, int length)
3435
{
35-
this( length, length);
36+
this( options, length, length);
3637
}
3738

3839
/**
3940
* Creates a new AbstractRegExpGen instance.
4041
*/
41-
protected AbstractRegExpGen( Integer minOccur, Integer maxOccur)
42+
protected AbstractRegExpGen( GenOptions options, Integer minOccur, Integer maxOccur)
4243
{
44+
options_ = options;
4345
setOccurrences( minOccur, maxOccur);
4446
}
4547

@@ -83,6 +85,14 @@ public int getMaxOccur()
8385
return occurrences_.getMaxValue();
8486
}
8587

88+
/**
89+
* Returns the {@link GenOptions options} for this generator.
90+
*/
91+
public GenOptions getOptions()
92+
{
93+
return options_;
94+
}
95+
8696
/**
8797
* Returns a random string within the given bounds that matches this regular expression.
8898
*/
@@ -195,7 +205,10 @@ public int hashCode()
195205
private Bounds occurrences_;
196206
private boolean anchoredStart_ = false;
197207
private boolean anchoredEnd_ = false;
208+
private final GenOptions options_;
198209

210+
public static final GenOptions BUILDER_OPTIONS = new GenOptions();
211+
199212
/**
200213
* Builds an {@link AbstractRegExpGen} instance.
201214
*/

src/main/java/org/cornutum/regexpgen/js/AlternativeGen.java

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
package org.cornutum.regexpgen.js;
99

1010
import org.cornutum.regexpgen.Bounds;
11+
import org.cornutum.regexpgen.GenOptions;
1112
import org.cornutum.regexpgen.RandomGen;
1213
import org.cornutum.regexpgen.RegExpGen;
1314
import org.cornutum.regexpgen.util.ToString;
@@ -30,16 +31,17 @@ public class AlternativeGen extends AbstractRegExpGen
3031
/**
3132
* Creates a new AlternativeGen instance.
3233
*/
33-
public AlternativeGen()
34+
public AlternativeGen( GenOptions options)
3435
{
35-
super();
36+
super( options);
3637
}
3738

3839
/**
3940
* Creates a new AlternativeGen instance.
4041
*/
41-
public AlternativeGen( RegExpGen... members)
42+
public AlternativeGen( GenOptions options, RegExpGen... members)
4243
{
44+
this( options);
4345
for( RegExpGen member : members)
4446
{
4547
add( member);
@@ -49,8 +51,9 @@ public AlternativeGen( RegExpGen... members)
4951
/**
5052
* Creates a new AlternativeGen instance.
5153
*/
52-
public <T extends RegExpGen> AlternativeGen( Iterable<T> members)
54+
public <T extends RegExpGen> AlternativeGen( GenOptions options, Iterable<T> members)
5355
{
56+
this( options);
5457
for( RegExpGen member : members)
5558
{
5659
add( member);
@@ -262,6 +265,14 @@ public static Builder builder()
262265
return new Builder();
263266
}
264267

268+
/**
269+
* Returns an {@link AlternativeGen} builder that uses the given {@link GenOptions options}.
270+
*/
271+
public static Builder builder( GenOptions options)
272+
{
273+
return new Builder( options);
274+
}
275+
265276
public String toString()
266277
{
267278
return
@@ -298,6 +309,16 @@ public int hashCode()
298309
*/
299310
public static class Builder extends BaseBuilder<Builder>
300311
{
312+
public Builder()
313+
{
314+
this( BUILDER_OPTIONS);
315+
}
316+
317+
public Builder( GenOptions options)
318+
{
319+
alternative_ = new AlternativeGen( options);
320+
}
321+
301322
/**
302323
* Returns the {@link AbstractRegExpGen} instance for this builder.
303324
*/
@@ -329,6 +350,6 @@ public AlternativeGen build()
329350
return alternative_;
330351
}
331352

332-
private AlternativeGen alternative_ = new AlternativeGen();
353+
private AlternativeGen alternative_;
333354
}
334355
}

0 commit comments

Comments
 (0)