Mega Code Archive

Reads CSV (Comma Separated Value) files

/*------------------------------------------------------------------------------
Name:    CSVReader.java
Project: jutils.org
Comment: Reads CSV (Comma Separated Value) files
Version: $Id: CSVReader.java,v 1.1 2004/04/07 07:40:45 laurent Exp $
Author:  Roedy Green roedy@mindprod.com, Heinrich Goetzger goetzger@gmx.net
------------------------------------------------------------------------------*/

import java.util.ArrayList;
import java.util.List;
import java.util.Vector;
import java.io.BufferedReader;
import java.io.EOFException;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;

/**
 * Reads CSV (Comma Separated Value) files.
 *
 * This format is mostly used by Microsoft Word and Excel.
 * Fields are separated by commas, and enclosed in
 * quotes if they contain commas or quotes.
 * Embedded quotes are doubled.
 * Embedded spaces do not normally require surrounding quotes.
 * The last field on the line is not followed by a comma.
 * Null fields are represented by two commas in a row.
 * We ignore leading and trailing spaces on fields, even inside quotes.
 * A '#' outside of quotes starts a comment that runs to the end of the line.
 *
 * @author copyright (c) 2002 Roedy Green Canadian Mind Products
 * Roedy posted this code on Newsgroups:comp.lang.java.programmer on 27th March 2002.
 *
 * Heinrich added some stuff like comment ability and linewise working.
 */
public class CSVReader {

    /** Guards the built-in test drivers at the bottom of this class. */
    private static final boolean debugging = true;

    /* ---- character categories fed into the finite state machine ---- */

    /** category of end of line char. */
    private static final int EOL = 0;

    /** category of ordinary character */
    private static final int ORDINARY = 1;

    /** category of the quote mark " */
    private static final int QUOTE = 2;

    /** category of the separator, e.g. comma, semicolon or tab. */
    private static final int SEPARATOR = 3;

    /** category of characters treated as white space. */
    private static final int WHITESPACE = 4;

    /* ---- states of the parser's finite state automaton ---- */

    /** parser: We are in blanks before the field. */
    private static final int SEEKINGSTART = 0;

    /** parser: We are in the middle of an ordinary field. */
    private static final int INPLAIN = 1;

    /** parser: We are in middle of field surrounded in quotes. */
    private static final int INQUOTED = 2;

    /** parser: We have just hit a quote, might be doubled or might be last one. */
    private static final int AFTERENDQUOTE = 3;

    /** parser: We are in blanks after the field looking for the separator */
    private static final int SKIPPINGTAIL = 4;

    /** Reader source of the CSV fields to be read; null once {@link #close()} was called. */
    private BufferedReader r;

    /**
     * field separator character, usually ',' in North America,
     * ';' in Europe and sometimes '\t' for tab.
     */
    private char separator;

    /**
     * The line we are parsing.
     * null means none read yet.
     * Line contains unprocessed chars. Processed ones are removed.
     */
    private String line = null;

    /**
     * How many lines we have read so far.
     * Used in error messages.
     */
    private int lineCount = 0;

    /** Problem swallowed by the most recent {@link #getLine()} call, or null if there was none. */
    private IOException lastError = null;

    /**
     * Constructor
     *
     * @param r input Reader source of CSV Fields to read.
     * @param separator
     *        field separator character, usually ',' in North America,
     *        ';' in Europe and sometimes '\t' for tab.
     */
    public CSVReader(Reader r, char separator) {
        /* convert Reader to BufferedReader if necessary */
        if (r instanceof BufferedReader) {
            this.r = (BufferedReader) r;
        } else {
            this.r = new BufferedReader(r);
        }
        this.separator = separator;
    } // end of CSVReader

    /**
     * Constructor with default field separator ','.
     *
     * @param r input Reader source of CSV Fields to read.
     */
    public CSVReader(Reader r) {
        this(r, ',');
    } // end of CSVReader

    /**
     * categorise a character for the finite state machine.
     *
     * Blank, carriage return and 0xff are always white space, '\n' and '#'
     * always end the line and '"' is always the quote, even when the same
     * character was configured as separator.
     *
     * @param c the character to categorise
     * @return integer representing the character's category.
     */
    private int categorise(char c) {
        switch (c) {
            case ' ':
            case '\r':
            case 0xff:
                return WHITESPACE;

            case '#':  /* comment start: the rest of the line is ignored */
            case '\n': /* artificially applied to end of line */
                return EOL;

            case '\"':
                return QUOTE;

            default:
                if (c == separator) {
                    /* dynamically determined so can't use as case label */
                    return SEPARATOR;
                }
                /* do our tests in crafted order, hoping for an early return */
                if ('!' <= c && c <= '~') {
                    return ORDINARY;
                }
                if (c <= 0x20 || Character.isWhitespace(c)) {
                    return WHITESPACE;
                }
                return ORDINARY;
        } // end of switch
    } // end of categorise

    /**
     * Reads all fields of the next line that contains at least one field.
     * Empty lines and lines holding only a comment are skipped.
     *
     * This method never throws. If a line is malformed, the remainder of
     * that line is discarded, the fields parsed before the problem are
     * returned (possibly an empty array) and the next call continues with
     * the following line. If the underlying reader fails, the failure is
     * treated like the end of the input. In both cases the swallowed
     * exception is available from {@link #getLastError()}.
     *
     * @return the trimmed fields of the line, or null at the end of the input.
     */
    public String[] getLine() {
        List fields = new ArrayList();
        lastError = null;

        try {
            // keep reading until a line yields at least one field
            while (fields.isEmpty()) {
                String token;
                while ((token = get()) != null) {
                    fields.add(token);
                }
            }
        } catch (EOFException e) {
            return null;
        } catch (IOException e) {
            lastError = e;
            if (line == null) {
                // No line is buffered, so the failure came from the reader
                // itself and not from the parser. Retrying would fail again,
                // so report the end of the readable input.
                return null;
            }
            // The parser rejected the buffered line. Drop what is left of it;
            // otherwise every following call would re-parse the same text,
            // fail again and never advance to the next line.
            line = null;
        }

        return (String[]) fields.toArray(new String[fields.size()]);
    } // end of getLine

    /**
     * Reports the problem that the most recent {@link #getLine()} call had to swallow.
     *
     * @return the exception raised for malformed data or a failing reader
     *         during the last getLine() call, or null if that call succeeded
     *         or reached the regular end of the input.
     */
    public IOException getLastError() {
        return lastError;
    } // end of getLastError

    /**
     * Read one field from the CSV file
     *
     * @return String value, even if the field is numeric. Surrounded
     *         and embedded double quotes are stripped.
     *         possibly "". null means end of line.
     *
     * @exception EOFException
     *            at end of file after all the fields have
     *            been read.
     *
     * @exception IOException
     *            Some problem reading the file, possibly malformed data.
     *            The offending line stays buffered when malformed data
     *            is reported.
     */
    private String get() throws EOFException, IOException {
        StringBuffer field = new StringBuffer(50);
        /* we implement the parser as a finite state automaton with five states. */
        readLine();

        /* start seeking, even if partway through a line */
        /* don't need to maintain state between fields. */
        int state = SEEKINGSTART;

        /* loop for each char in the line to find a field */
        /* guaranteed to leave early by hitting EOL */
        for (int i = 0; i < line.length(); i++) {
            char c = line.charAt(i);
            int category = categorise(c);

            switch (state) {
                case SEEKINGSTART: /* in blanks before field */
                    switch (category) {
                        case WHITESPACE:
                            /* ignore */
                            break;
                        case QUOTE:
                            state = INQUOTED;
                            break;
                        case SEPARATOR:
                            /* end of empty field */
                            line = line.substring(i + 1);
                            return "";
                        case EOL:
                            /* end of line */
                            line = null;
                            return null;
                        case ORDINARY:
                            field.append(c);
                            state = INPLAIN;
                            break;
                    }
                    break;

                case INPLAIN: /* in middle of ordinary field */
                    switch (category) {
                        case QUOTE:
                            throw new IOException("Malformed CSV stream. Missing quote at start of field on line " + lineCount);
                        case SEPARATOR:
                            /* done */
                            line = line.substring(i + 1);
                            return field.toString().trim();
                        case EOL:
                            line = line.substring(i); /* push EOL back */
                            return field.toString().trim();
                        case WHITESPACE:
                            field.append(' ');
                            break;
                        case ORDINARY:
                            field.append(c);
                            break;
                    }
                    break;

                case INQUOTED: /* in middle of field surrounded in quotes */
                    switch (category) {
                        case QUOTE:
                            state = AFTERENDQUOTE;
                            break;
                        case EOL:
                            throw new IOException("Malformed CSV stream. Missing quote after field on line " + lineCount);
                        case WHITESPACE:
                            field.append(' ');
                            break;
                        case SEPARATOR:
                        case ORDINARY:
                            field.append(c);
                            break;
                    }
                    break;

                case AFTERENDQUOTE:
                    /* In situation like this "xxx" which may
                       turn out to be xxx""xxx" or "xxx",
                       We find out here. */
                    switch (category) {
                        case QUOTE:
                            /* doubled quote: one literal quote, still inside the field */
                            field.append(c);
                            state = INQUOTED;
                            break;
                        case SEPARATOR:
                            /* we are done. */
                            line = line.substring(i + 1);
                            return field.toString().trim();
                        case EOL:
                            line = line.substring(i); /* push back eol */
                            return field.toString().trim();
                        case WHITESPACE:
                            /* ignore trailing spaces up to separator */
                            state = SKIPPINGTAIL;
                            break;
                        case ORDINARY:
                            throw new IOException("Malformed CSV stream, missing separator after field on line " + lineCount);
                    }
                    break;

                case SKIPPINGTAIL: /* in spaces after field seeking separator */
                    switch (category) {
                        case SEPARATOR:
                            /* we are done. */
                            line = line.substring(i + 1);
                            return field.toString().trim();
                        case EOL:
                            line = line.substring(i); /* push back eol */
                            return field.toString().trim();
                        case WHITESPACE:
                            /* ignore trailing spaces up to separator */
                            break;
                        case QUOTE:
                        case ORDINARY:
                            throw new IOException("Malformed CSV stream, missing separator after field on line " + lineCount);
                    }
                    break;
            } // end switch(state)
        } // end for

        throw new IOException("Program logic bug. Should not reach here. Processing line " + lineCount);
    } // end get

    /**
     * Make sure a line is available for parsing.
     * Does nothing if there already is one.
     *
     * @exception EOFException if the input is exhausted.
     * @exception IOException  if the reader fails or was already closed.
     */
    private void readLine() throws EOFException, IOException {
        if (line != null) {
            return;
        }
        if (r == null) {
            /* close() was called; report it instead of failing with a NullPointerException */
            throw new IOException("CSVReader has been closed");
        }
        String next = r.readLine(); /* this strips platform specific line ending */
        if (next == null) {
            /* null means EOF */
            throw new EOFException();
        }
        line = next + '\n'; /* apply standard line end for parser to find */
        lineCount++;
    } // end of readLine

    /**
     * Skip over fields you don't want to process.
     *
     * @param fields How many field you want to bypass reading.
     *               The newline counts as one field.
     * @exception EOFException
     *            at end of file after all the fields have
     *            been read.
     * @exception IOException
     *            Some problem reading the file, possibly malformed data.
     */
    public void skip(int fields) throws EOFException, IOException {
        for (int i = 0; i < fields; i++) {
            // throw results away
            get();
        }
    } // end of skip

    /**
     * Skip over remaining fields on this line you don't want to process.
     * If no line is buffered, the next line is read and skipped.
     *
     * @exception EOFException
     *            at end of file after all the fields have
     *            been read.
     * @exception IOException
     *            Some problem reading the file, possibly malformed data.
     */
    public void skipToNextLine() throws EOFException, IOException {
        if (line == null) {
            readLine();
        }
        line = null;
    } // end of skipToNextLine

    /**
     * Close the Reader. Calling it again has no effect.
     */
    public void close() throws IOException {
        if (r != null) {
            r.close();
            r = null;
        }
    } // end of close

    /**
     * Closes a reader used by the test drivers, ignoring any failure.
     *
     * @param csv the reader to close, may be null.
     */
    private static void closeQuietly(CSVReader csv) {
        if (csv == null) {
            return;
        }
        try {
            csv.close();
        } catch (IOException ignored) {
            /* nothing useful can be done about a failing close in a test driver */
        }
    } // end of closeQuietly

    /**
     * Prints every field of the file on its own line; "null" marks each end of line.
     *
     * @param args [0]: The name of the file.
     */
    private static void testSingleTokens(String[] args) {
        if (!debugging) {
            return;
        }
        CSVReader csv = null;
        try {
            // read test file
            csv = new CSVReader(new FileReader(args[0]), ',');
            try {
                while (true) {
                    System.out.println(csv.get());
                }
            } catch (EOFException e) {
                /* regular end of the file: stop printing */
            }
        } catch (IOException e) {
            e.printStackTrace();
            System.out.println(e.getMessage());
        } finally {
            closeQuietly(csv);
        }
    } // end of testSingleTokens

    /**
     * Prints the file line by line with the fields re-joined by a comma.
     *
     * @param args [0]: The name of the file.
     */
    private static void testLines(String[] args) {
        if (!debugging) {
            return;
        }
        final String DEL = ",";
        CSVReader csv = null;
        try {
            // read test file
            csv = new CSVReader(new FileReader(args[0]), ',');
            String[] fields;
            while ((fields = csv.getLine()) != null) {
                IOException problem = csv.getLastError();
                if (problem != null) {
                    System.err.println(problem.getMessage());
                }
                // Join all fields; works for empty and single-field lines too.
                StringBuffer logBuffer = new StringBuffer();
                for (int i = 0; i < fields.length; i++) {
                    if (i > 0) {
                        logBuffer.append(DEL);
                    }
                    logBuffer.append(fields[i]);
                }
                System.out.println(logBuffer.toString());
            } // end of while
        } catch (IOException e) {
            e.printStackTrace();
            System.out.println(e.getMessage());
        } finally {
            closeQuietly(csv);
        }
    } // end of testLines

    /**
     * Test driver
     *
     * @param args [0]: The name of the file.
     */
    static public void main(String[] args) {
        if (args == null || args.length < 1) {
            System.err.println("Usage: java CSVReader <csv-file>");
            return;
        }
        // testSingleTokens(args) is the field-by-field alternative to this driver.
        testLines(args);
    } // end main

} // end CSVReader

// end of file