// A utility class that parses a Comma Separated Values (CSV) file
/*
Java and XSLT
By Eric M.Burke
1st Edition September 2001
ISBN: 0-596-00143-6
*/
import java.io.IOException;
import java.util.*;
import org.xml.sax.*;
import java.io.*;
import java.net.URL;
import org.xml.sax.*;
import org.xml.sax.helpers.*;
/**
* A utility class that parses a Comma Separated Values (CSV) file
* and outputs its contents using SAX2 events. The format of CSV that
* this class reads is identical to the export format for Microsoft
* Excel. For simple values, the CSV file may look like this:
*
* a,b,c
* d,e,f
*
* Quotes are used as delimiters when the values contain commas:
*
* a,"b,c",d
* e,"f,g","h,i"
*
* And double quotes are used when the values contain quotes. This parser
* is smart enough to trim spaces around commas, as well.
*
* @author Eric M. Burke
*/
public class CSVXMLReader extends AbstractXMLReader {
// Shared attribute list with no entries, passed to startElement for the
// csvFile, line and value elements that parse and parseLine emit.
private static final Attributes EMPTY_ATTR = new AttributesImpl();
/**
 * Parse a CSV file. SAX events are delivered to the ContentHandler
 * that was registered via <code>setContentHandler</code>. If no handler
 * is registered the method returns immediately without reading anything.
 *
 * The input is taken from the first of these that is available: the
 * character stream, the byte stream, or the system id (opened as a URL).
 * Byte input is decoded with the encoding declared on the InputSource;
 * when none is declared the platform default charset is used. A stream
 * that this method opens itself from the system id is closed before the
 * method returns; streams supplied by the caller are left open.
 *
 * Each non-blank line produces one <line> element inside a single
 * <csvFile> root, with one <value> child per field.
 *
 * @param input the comma separated values file to parse.
 * @throws IOException if the input cannot be opened, decoded or read.
 * @throws SAXException if the InputSource has no character stream, byte
 *         stream or system id, or if the ContentHandler reports an error.
 */
public void parse(InputSource input) throws IOException,
        SAXException {
    // if no handler is registered to receive events, don't bother
    // to parse the CSV file
    ContentHandler ch = getContentHandler();
    if (ch == null) {
        return;
    }

    // only a stream opened here (from the system id) is ours to close
    InputStream ownedStream = null;
    try {
        // convert the InputSource into a BufferedReader
        BufferedReader br;
        if (input.getCharacterStream() != null) {
            // already decoded, so any declared encoding does not apply
            br = new BufferedReader(input.getCharacterStream());
        } else {
            InputStream bytes;
            if (input.getByteStream() != null) {
                bytes = input.getByteStream();
            } else if (input.getSystemId() != null) {
                java.net.URL url = new URL(input.getSystemId());
                ownedStream = url.openStream();
                bytes = ownedStream;
            } else {
                throw new SAXException("Invalid InputSource object");
            }
            // honour the encoding declared on the InputSource; fall back
            // to the platform default only when none was given
            String encoding = input.getEncoding();
            Reader decoded = (encoding != null)
                    ? new InputStreamReader(bytes, encoding)
                    : new InputStreamReader(bytes);
            br = new BufferedReader(decoded);
        }

        ch.startDocument();
        // emit <csvFile>
        ch.startElement("", "", "csvFile", EMPTY_ATTR);

        // read each line of the file until EOF is reached
        String curLine;
        while ((curLine = br.readLine()) != null) {
            curLine = curLine.trim();
            // blank lines produce no <line> element
            if (curLine.length() > 0) {
                // create the <line> element
                ch.startElement("", "", "line", EMPTY_ATTR);
                // output data from this line
                parseLine(curLine, ch);
                // close the </line> element
                ch.endElement("", "", "line");
            }
        }

        // emit </csvFile>
        ch.endElement("", "", "csvFile");
        ch.endDocument();
    } finally {
        if (ownedStream != null) {
            try {
                ownedStream.close();
            } catch (IOException ignored) {
                // the stream was only read from; a failure to close it
                // must not mask an exception raised while parsing
            }
        }
    }
}
// Break an individual line into tokens and emit one <value> element per
// token. Tokens are peeled off the front of the line one at a time in a
// loop, so the call stack stays flat no matter how many fields the line
// contains.
//
// curLine: the text of one CSV line; the caller passes it already trimmed.
// ch: the handler that receives the startElement/characters/endElement
//     events for each value.
private void parseLine(String curLine, ContentHandler ch)
        throws IOException, SAXException {
    String pending = curLine;
    do {
        String token;
        String rest = null;
        int commaIndex = locateFirstDelimiter(pending);
        if (commaIndex > -1) {
            token = pending.substring(0, commaIndex).trim();
            // may be empty when the line ends with a comma; that still
            // yields one final, empty value on the next pass
            rest = pending.substring(commaIndex + 1).trim();
        } else {
            // no commas, so everything that is left is the token
            token = pending;
        }

        // remove redundant quotes
        token = cleanupQuotes(token);

        // emit the <value> element
        ch.startElement("", "", "value", EMPTY_ATTR);
        ch.characters(token.toCharArray(), 0, token.length());
        ch.endElement("", "", "value");

        // continue with whatever follows the delimiter, if anything
        pending = rest;
    } while (pending != null);
}
// locate the position of the comma, taking into account that
// a quoted token may contain ignorable commas.
private int locateFirstDelimiter(String curLine) {
if (curLine.startsWith("\"")) {
boolean inQuote = true;
int numChars = curLine.length();
for (int i=1; i