Java Regex

An example of a tokenizer using the java.util.regex library, which was introduced in JDK 1.4 and is also part of JDK 1.5:


// Pattern to split a phrase into words separated by spaces or commas.
// The possessive quantifier ++ consumes a whole run of delimiters at once,
// so consecutive delimiters ("a, b,,c") produce no empty fields between words.
// Note: a phrase that starts with a delimiter still yields one empty
// leading element, because that is how Pattern.split treats a leading match.
// Declared final: the Pattern is compiled once and then only reused.
private static final Pattern splitter = Pattern.compile( "[, ]++" );
... 
// Split phrase into words 
String[] words = splitter.split( phrase ); 

A bit bigger example...


import java.util.regex. Matcher; 
import java.util.regex. Pattern; 
... 
// Each declaration below is an independent example; use one at a time.
// simple ? wildcard: match ?gloss.html 
// In the regex, . stands for any single character and \\. for a literal dot.
Pattern p = Pattern.compile( ".gloss\\.html" ); 

 // simple * wildcard: match g*.txt 
 // .* stands for any run of characters, including none.
 Pattern p = Pattern.compile( "g.*\\.txt" ); 

 // match cat.html or dog.html 
Pattern p = Pattern.compile( "(cat|dog)\\.html" ) ; 

 // negative wildcards: match *.html but not cat.html or dog.html 
 // The negative lookahead (?!...) rejects the two excluded names before .* runs.
Pattern p = Pattern.compile( "(?!cat\\.html$|dog\\.html$).*\\.html" ) ; 

 // match email address of the form roedy.green_9@some-place.mind-prod.com 
// Dots are allowed anywhere in the name before the @, so two dots in a row
// and a name starting with a dot both pass. A dot is not allowed at the
// start of the domain name. The + character is not accepted, and only
// lowercase letters, digits, - and (in the name) _ are matched.
Pattern p =
  Pattern.compile( "[a-z0-9\\-_\\.]++@[a-z0-9\\-]++(\\.[a-z0-9\\-]++)++" ); 

 // dealing with \ and . Match the wildcard "r*\*.*", i.e. r, anything,
 // a literal backslash, anything, a literal dot, anything.
 // On command line the regex would be "r.*\\.*\..*"; inside a Java string
 // literal every backslash has to be doubled once more.
Pattern p = Pattern.compile( "r.*\\\\.*\\..*" ) ; 

// Using Regex Capture Groups 
// search for strings of the form ...<title>...</title>... 
// and capture the string between the two tags. 
// The capturing group ( ... ) only accepts letters, digits and spaces,
// so a title containing any other character will not match.
Pattern titleFinder = Pattern.compile( ".*\\<title>([a-zA-Z0-9 ]*)\\</title>.*" ); 

 Matcher m = titleFinder.matcher( "some stuff <title>Exploring Kenya</title> more stuff" ); 

 // matches() must be called (and succeed) before group() may be used.
 // prints matches? true 
System.out.println( "matches? " + m.matches() ); 

 // groupCount() is the number of capturing groups in the pattern,
 // not counting group 0.
 // prints count: 1 
System.out.println( "count: " + m.groupCount() ); 

 // group 0 is the text matched by the whole pattern.
 // prints whole: some stuff <title>Exploring Kenya</title> more stuff 
System.out.println( "whole: " + m.group( 0 )) ; 

 // group 1 is the text matched by the first pair of parentheses.
 // prints captured title: Exploring Kenya 
System.out.println( "captured title: " + m.group( 1 )) ; 

Matching vs Finding: When you want the entire String to match your Pattern, use Matcher.matches. When you want to find fragments in your String that match the Pattern, use Matcher.find.


// find stuff between <td> ... </td> tags 
// prints two lines: "orca" and then "pilot whale".
// The empty <td></td> cell is skipped because ++ requires at least one
// character between the tags.
Pattern p = Pattern.compile( "\\<td>([^\\<>]++)\\</td>" ); 
Matcher m = p.matcher( "dolphin <td>orca</td> junk\n" 
  + "<td></td> empty" 
  + "<td>pilot whale</td> beluga\n" 
  ); 
// each successful find() advances to the next fragment that matches
while ( m.find () ) { 
  int gc = m.groupCount(); 
  // group 0 is the whole pattern 
   // run from 1 to gc, not 0 to gc-1 as is traditional. 
  for ( int i=1 ; i<=gc ; i++ ) { 
    System.out.println( m.group( i )) ; 
  }
}