An example of a tokenizer using the java.util.regex library, available since JDK 1.4:
// Pattern to split a phrase into words separated by spaces or commas.
// A run of consecutive separators (e.g. ", ") is consumed as one delimiter,
// so no empty (null) fields appear between words.
// NOTE: a phrase that begins with a separator still yields a leading
// empty string, as specified by Pattern.split.
private static Pattern splitter = Pattern.compile ("[, ]++" ) ;
...
// Split phrase into words; trailing empty strings are dropped by split.
String[] words = splitter.split( phrase );
A bit bigger example...
import java.util.regex. Matcher; import java.util.regex. Pattern; ... // simple ? wildcard: match ?gloss.html Pattern p = Pattern.compile( ".gloss\\.html" ); // simple * wildcard: match g*.txt Pattern p = Pattern.compile( "g.*\\.txt" ); // match cat.html or dog.html Pattern p = Pattern.compile( "(cat|dog)\\.html" ) ; // negative wildcards: match *.html but not cat.html or dog.html Pattern p = Pattern.compile( "(?!cat\\.html$|dog\\.html$).*\\.html" ) ; // match email address of the form roedy.green_9@some-place.mind-prod.com // allow dots anywhere, but not at start of domain name, no + // lets two dots in row pass, and name to start with dot. Pattern p = Pattern.compile( "[a-z0-9\\-_\\.]++@[a-z0-9\\-]++(\\.[a-z0-9\\-]++)++" ); // dealing with \ and . Match "r*\*.*". On command line would be "r.*\\.*\..*" Pattern p = Pattern.compile( "r.*\\\\.*\\..*" ) ; // Using Regex Capture Groups // search for strings of the form ...<title>...</title>... // and capture the string between the two tags. Pattern titleFinder = Pattern.compile( ".*\\<title>([a-zA-Z0-9 ]*)\\</title>.*" ); Matcher m = titleFinder.matcher( "some stuff <title>Exploring Kenya</title> more stuff" ); // prints matches? true System.out.println( "matches? " + m.matches() ); // prints count: 1 System.out.println( "count: " + m.groupCount() ); // prints whole: some stuff <title>Exploring Kenya</title> more stuff System.out.println( "whole: " + m.group( 0 )) ; // prints captured title: Exploring Kenya System.out.println( "captured title: " + m.group( 1 )) ;
Matching vs Finding: When you want the entire String to match your Pattern, use Matcher.matches. When you want to find fragments in your String that match the Pattern, use Matcher.find.
// Locate the text enclosed by each <td> ... </td> pair of tags.
// Prints orca and pilot whale, each on its own line. The empty <td></td>
// cell is skipped because the capture group needs at least one character.
Pattern cellPattern = Pattern.compile( "\\<td>([^\\<>]++)\\</td>" );
String html = "dolphin <td>orca</td> junk\n"
        + "<td></td> empty"
        + "<td>pilot whale</td> beluga\n";
Matcher cells = cellPattern.matcher( html );
// The number of capturing groups is fixed by the pattern, so fetch it once.
final int groups = cells.groupCount();
while ( cells.find() ) {
    // Group 0 is the whole match, so the capturing groups are
    // numbered 1 through groups inclusive, not 0 through groups-1.
    int g = 1;
    while ( g <= groups ) {
        System.out.println( cells.group( g ) );
        g++;
    }
}