import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;

public class Tokenizer {

    private String fileContent = "";
    private int pos = 0;

    // Constructor: reads the whole file into memory as UTF-8 text.
    public Tokenizer(String fileName)
            throws FileNotFoundException, UnsupportedEncodingException, IOException {
        char[] cbuf = new char[200];
        int charsRead;
        StringBuilder sb = new StringBuilder();
        try (InputStreamReader isr = new InputStreamReader(new FileInputStream(fileName), "UTF-8")) {
            while ((charsRead = isr.read(cbuf, 0, 200)) != -1) {
                sb.append(cbuf, 0, charsRead);
            }
        }
        fileContent = sb.toString();
    }

    // Returns the next token in lower case, or null when the input is exhausted.
    // A token is a run of letters and digits; internal apostrophes (as in "don't")
    // are kept, while trailing apostrophes are stripped.
    public String nextToken() {
        // Skip anything that is not a letter or digit.
        while (pos < fileContent.length()
                && !Character.isLetterOrDigit(fileContent.charAt(pos))) {
            pos++;
        }
        if (pos >= fileContent.length()) {
            return null;
        }

        // Accumulate letters, digits and apostrophes.
        String tok = "";
        while (pos < fileContent.length()) {
            char c = fileContent.charAt(pos);
            if (!Character.isLetterOrDigit(c) && c != '\'') {
                break;
            }
            tok += c;
            pos++;
        }

        // Remove trailing apostrophes; the token always starts with a letter
        // or digit, so it cannot become empty here.
        while (tok.charAt(tok.length() - 1) == '\'') {
            tok = tok.substring(0, tok.length() - 1);
        }

        return tok.toLowerCase();
    }
}
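
// A minimal usage sketch, not part of the original class: it assumes the file
// name is passed as the first command-line argument and simply prints every
// token until the input is exhausted. The class name TokenizerDemo and the
// println loop are illustrative assumptions only.
class TokenizerDemo {
    public static void main(String[] args) throws IOException {
        Tokenizer tokenizer = new Tokenizer(args[0]);
        String token;
        while ((token = tokenizer.nextToken()) != null) {
            System.out.println(token);
        }
    }
}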