Logo Search packages:      
Sourcecode: pauker version File versions  Download package

CsvParser.java

/*
 * CsvParser.java
 *
 * Created on 11. April 2006, 22:56
 *
 */

package tools;

import java.io.IOException;
import java.io.PushbackReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;

/**
 * A somewhat tolerant parser for CSV files.
 * @author Ronny.Standtke@gmx.net
 */
public final class CsvParser {

    /** Value returned by {@link Reader#read()} when the stream is exhausted. */
    private static final int EOF = -1;

    private CsvParser() {
        // static utility class, not meant to be instantiated
    }

    /**
     * Parses a character stream that follows the CSV grammar below.
     *
     * <pre>
     * CSV grammar:
     * ------------
     * csvFile ::= (csvLine)* [csvStringList] 'EOF'
     * csvLine ::= csvStringList newLine
     * newLine ::= '\n' | '\r' | '\r\n'
     * csvStringList ::= csvString (',' csvString)*
     * csvString := whitespace* [csvField whitespace*]
     * whitespace ::= ' ' | '\t'
     * csvField ::= simpleField | quotedField
     * simpleField ::= (any char except newLine, EOF, \t, space, comma or double quote)+
     * quotedField ::= '"' subField ('"' '"' subField)* '"'
     * subField ::= (any char except double quote or EOF)+
     * </pre>
     *
     * The parser is more tolerant than the grammar: simple fields may contain
     * spaces and double quotes, and a leading byte order mark is skipped.
     * Empty fields are reported as <code>null</code>. Line breaks inside
     * quoted fields are normalized to <code>'\n'</code>. The reader is not
     * closed by this method.
     *
     * @param reader the character stream where to read the CSV data from
     * @throws IOException if an Exception with IO or CSV syntax is thrown
     * @return a list of CSV lines that in turn contain a list of strings
     */
    public static List<List<String>> parseCsvFile(Reader reader)
            throws IOException {
        List<List<String>> csvFile = new ArrayList<List<String>>();
        // The parser never looks more than one character ahead; the buffer
        // size of 2 is kept as a safety margin.
        PushbackReader pushbackReader = new PushbackReader(reader, 2);

        // Unfortunately, we first have to check, if there is a BOM at the
        // beginning.
        // see http://de.wikipedia.org/wiki/Byte_Order_Mark
        // and http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4508058
        int bomTest = pushbackReader.read();
        if (bomTest != '\uFEFF' && bomTest != EOF) {
            // OK, no BOM, let's push it back.
            // (EOF must never be pushed back: unread(-1) would insert the
            // character U+FFFF into the stream.)
            pushbackReader.unread(bomTest);
        }

        int lineCounter = 1;
        while (peek(pushbackReader) != EOF) {
            csvFile.add(parseCsvLine(pushbackReader, lineCounter));
            lineCounter++;
        }
        return csvFile;
    }

    /**
     * Parses one CSV line, i.e. a comma separated list of CSV strings that is
     * terminated by a newline or EOF.
     *
     * @param pushbackReader the stream to read from
     * @param lineCounter the number of the current CSV line (for messages)
     * @return the strings of this line (empty fields are <code>null</code>)
     * @throws IOException if a field is followed by something else than a
     * comma, a newline or EOF
     */
    private static List<String> parseCsvLine(PushbackReader pushbackReader,
            int lineCounter) throws IOException {
        List<String> csvLine = new ArrayList<String>();

        // parse all strings at this line
        int column = 1;
        while (true) {
            csvLine.add(parseCsvString(pushbackReader, lineCounter));
            // to be syntactically correct there must follow a
            // newline, EOF or a comma
            if (endOfLineFollows(pushbackReader)) {
                consumeEndOfLine(pushbackReader);
                break;
            }
            int currentChar = pushbackReader.read();
            if (currentChar == EOF) {
                break;
            }
            if (currentChar != ',') {
                // syntax error!
                throw new IOException("missing comma after column " + column +
                        " of line " + lineCounter + " (found character \"" +
                        (char) currentChar + "\" == " + currentChar + ")");
            }
            column++;
        }

        return csvLine;
    }

    /**
     * Parses a single CSV string including its surrounding whitespace.
     *
     * @param pushbackReader the stream to read from
     * @param lineCounter the number of the current CSV line (for messages)
     * @return the field content or <code>null</code> if the field is empty
     * @throws IOException if a quoted field is not terminated
     */
    private static String parseCsvString(PushbackReader pushbackReader,
            int lineCounter) throws IOException {

        consumeOptionalWhiteSpaces(pushbackReader);

        // parse CSV field
        if (endOfLineFollows(pushbackReader)) {
            // this was an empty field
            return null;
        }
        int currentChar = pushbackReader.read();
        if (currentChar == EOF) {
            // an empty field at the very end of the input
            // (e.g. after a trailing comma or trailing whitespace)
            return null;
        }
        if (currentChar == ',') {
            // this was an empty field
            pushbackReader.unread(currentChar);
            return null;
        }

        String csvString;
        if (currentChar == '"') {
            // this is a quoted field
            csvString = parseQuotedCsvString(pushbackReader, lineCounter);
        } else {
            // this is a simple field
            csvString = parseSimpleCsvString(currentChar, pushbackReader);
        }

        consumeOptionalWhiteSpaces(pushbackReader);

        return csvString;
    }

    /**
     * Parses an unquoted field up to (but excluding) the next comma, newline
     * or EOF.
     *
     * In contrast to the grammar definition we allow double quotes and spaces
     * here (because OpenOffice allows them too). This is a basic principle:
     * We are tolerant when parsing but very exact when writing a format.
     *
     * @param currentChar the first character of the field (already read)
     * @param pushbackReader the stream to read the remaining characters from
     * @return the field content with surrounding whitespace trimmed
     * @throws IOException if reading from the stream fails
     */
    private static String parseSimpleCsvString(int currentChar,
            PushbackReader pushbackReader) throws IOException {

        StringBuilder stringBuilder = new StringBuilder();
        while (true) {
            stringBuilder.append((char) currentChar);
            int nextChar = peek(pushbackReader);
            if (nextChar == EOF || nextChar == ','
                    || nextChar == '\n' || nextChar == '\r') {
                // the delimiter stays in the stream for the caller
                break;
            }
            currentChar = pushbackReader.read();
        }

        // remove trailing whitespaces before return
        return stringBuilder.toString().trim();
    }

    /**
     * Parses the remainder of a quoted field; the opening quote must already
     * be consumed. Doubled double quotes are un-escaped and every line break
     * inside the field is normalized to <code>'\n'</code>.
     *
     * @param pushbackReader the stream to read from
     * @param lineCounter the number of the current CSV line (for messages)
     * @return the un-escaped content of the quoted field
     * @throws IOException if EOF is reached before the closing double quote
     */
    private static String parseQuotedCsvString(PushbackReader pushbackReader,
            int lineCounter) throws IOException {

        StringBuilder stringBuilder = new StringBuilder();

        while (true) {
            if (endOfLineFollows(pushbackReader)) {
                consumeEndOfLine(pushbackReader);
                stringBuilder.append('\n');
                // re-check: several line breaks may follow each other
                continue;
            }
            int currentChar = pushbackReader.read();
            if (currentChar == EOF) {
                throw new IOException("missing terminating double quote in " +
                        "CSV line " + lineCounter);
            }
            if (currentChar != '"') {
                stringBuilder.append((char) currentChar);
                continue;
            }
            if (peek(pushbackReader) != '"') {
                // The double quote was not escaped but it was the closing
                // quote of this CSV string.
                return stringBuilder.toString();
            }
            // this was an escaped double quote: consume its second half
            pushbackReader.read();
            stringBuilder.append('"');
        }
    }

    /**
     * Skips all spaces and tabs at the current stream position.
     *
     * @param pushbackReader the stream to read from
     * @throws IOException if reading from the stream fails
     */
    private static void consumeOptionalWhiteSpaces(
            PushbackReader pushbackReader) throws IOException {
        int currentChar = pushbackReader.read();
        while (isSpace(currentChar)) {
            currentChar = pushbackReader.read();
        }
        if (currentChar != EOF) {
            pushbackReader.unread(currentChar);
        }
    }

    /**
     * Checks (without consuming anything) if the next character starts a
     * newline.
     *
     * @param pushbackReader the stream to look at
     * @return <code>true</code>, if the next character is '\n' or '\r'
     * @throws IOException if reading from the stream fails
     */
    private static boolean endOfLineFollows(PushbackReader pushbackReader)
            throws IOException {
        int nextChar = peek(pushbackReader);
        return nextChar == '\n' || nextChar == '\r';
    }

    /**
     * Consumes exactly one newline ('\n', '\r' or '\r\n').
     *
     * @param pushbackReader the stream to read from
     * @throws IOException if the next character is not a newline character
     */
    private static void consumeEndOfLine(PushbackReader pushbackReader)
            throws IOException {
        int currentChar = pushbackReader.read();
        if (currentChar == '\r') {
            // check the next character too because of the two character
            // '\r\n' newline
            int nextChar = pushbackReader.read();
            if (nextChar != '\n' && nextChar != EOF) {
                // this was a single '\r' newline
                // (must push back the character that we read too much,
                // but never EOF, see parseCsvFile)
                pushbackReader.unread(nextChar);
            }
        } else if (currentChar != '\n') {
            throw new IOException("there was no newline here");
        }
    }

    /**
     * Returns the next character of the stream without consuming it.
     *
     * @param pushbackReader the stream to look at
     * @return the next character or -1 at the end of the stream
     * @throws IOException if reading from the stream fails
     */
    private static int peek(PushbackReader pushbackReader)
            throws IOException {
        int nextChar = pushbackReader.read();
        if (nextChar != EOF) {
            pushbackReader.unread(nextChar);
        }
        return nextChar;
    }

    private static boolean isSpace(int currentChar) {
        return (currentChar == ' ' || currentChar == '\t');
    }
}

Generated by  Doxygen 1.6.0   Back to index