1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 package com.csvreader;
22
23 import java.io.BufferedReader;
24 import java.io.File;
25 import java.io.FileInputStream;
26 import java.io.FileNotFoundException;
27 import java.io.IOException;
28 import java.io.InputStream;
29 import java.io.InputStreamReader;
30 import java.io.Reader;
31 import java.io.StringReader;
32 import java.nio.charset.Charset;
33 import java.text.NumberFormat;
34 import java.util.HashMap;
35
36 /***
37 * A stream based parser for parsing delimited text data from a file or a
38 * stream.
39 */
40 public class CsvReader {
41 private Reader inputStream = null;
42
43 private String fileName = null;
44
45
46 private UserSettings userSettings = new UserSettings();
47
48 private Charset charset = null;
49
50 private boolean useCustomRecordDelimiter = false;
51
52
53
54
55 private DataBuffer dataBuffer = new DataBuffer();
56
57 private ColumnBuffer columnBuffer = new ColumnBuffer();
58
59 private RawRecordBuffer rawBuffer = new RawRecordBuffer();
60
61 private boolean[] isQualified = null;
62
63 private String rawRecord = "";
64
65 private HeadersHolder headersHolder = new HeadersHolder();
66
67
68
69
70
71 private boolean startedColumn = false;
72
73 private boolean startedWithQualifier = false;
74
75 private boolean hasMoreData = true;
76
77 private char lastLetter = '\0';
78
79 private boolean hasReadNextLine = false;
80
81 private int columnsCount = 0;
82
83 private long currentRecord = 0;
84
85 private String[] values = new String[StaticSettings.INITIAL_COLUMN_COUNT];
86
87 private boolean initialized = false;
88
89 private boolean closed = false;
90
91 /***
92 * Double up the text qualifier to represent an occurrence of the text
93 * qualifier. One of the two values accepted by setEscapeMode(int).
94 */
95 public static final int ESCAPE_MODE_DOUBLED = 1;
96
97 /***
98 * Use a backslash character before the text qualifier to represent an
99 * occurrence of the text qualifier. Accepted by setEscapeMode(int).
100 */
101 public static final int ESCAPE_MODE_BACKSLASH = 2;
102
103 /***
104 * Creates a {@link com.csvreader.CsvReader CsvReader} object using a file
105 * as the data source.
106 *
107 * @param fileName
108 * The path to the file to use as the data source.
109 * @param delimiter
110 * The character to use as the column delimiter.
111 * @param charset
112 * The {@link java.nio.charset.Charset Charset} to use while
113 * parsing the data.
114 */
115 public CsvReader(String fileName, char delimiter, Charset charset) throws FileNotFoundException {
116 if (fileName == null) {
117 throw new IllegalArgumentException("Parameter fileName can not be null.");
118 }
119
120 if (charset == null) {
121 throw new IllegalArgumentException("Parameter charset can not be null.");
122 }
123
124 if (!new File(fileName).exists()) {
125 throw new FileNotFoundException("File " + fileName + " does not exist.");
126 }
127
128 this.fileName = fileName;
129 this.userSettings.Delimiter = delimiter;
130 this.charset = charset;
131
132 isQualified = new boolean[values.length];
133 }
134
/***
 * Builds a {@link com.csvreader.CsvReader CsvReader} that will read from a
 * file decoded as ISO-8859-1.
 *
 * @param fileName
 *            Path of the file to parse.
 * @param delimiter
 *            Character separating the columns.
 * @exception FileNotFoundException
 *                If no file exists at the given path.
 */
public CsvReader(String fileName, char delimiter) throws FileNotFoundException {
    this(fileName, delimiter, Charset.forName("ISO-8859-1"));
}
148
/***
 * Builds a {@link com.csvreader.CsvReader CsvReader} that will read a
 * comma-delimited file decoded as ISO-8859-1.
 *
 * @param fileName
 *            Path of the file to parse.
 * @exception FileNotFoundException
 *                If no file exists at the given path.
 */
public CsvReader(String fileName) throws FileNotFoundException {
    this(fileName, Letters.COMMA);
}
160
/***
 * Builds a {@link com.csvreader.CsvReader CsvReader} on top of an already
 * open {@link java.io.Reader Reader}. The reader is used as-is, so the
 * instance is marked as initialized straight away.
 *
 * @param inputStream
 *            Character source to parse. Must not be null.
 * @param delimiter
 *            Character separating the columns.
 */
public CsvReader(Reader inputStream, char delimiter) {
    if (null == inputStream) {
        throw new IllegalArgumentException("Parameter inputStream can not be null.");
    }

    this.inputStream = inputStream;
    this.initialized = true;
    this.userSettings.Delimiter = delimiter;
    this.isQualified = new boolean[this.values.length];
}
181
/***
 * Builds a comma-delimited {@link com.csvreader.CsvReader CsvReader} on top
 * of an already open {@link java.io.Reader Reader}.
 *
 * @param inputStream
 *            Character source to parse.
 */
public CsvReader(Reader inputStream) {
    this(inputStream, Letters.COMMA);
}
193
/***
 * Builds a {@link com.csvreader.CsvReader CsvReader} on top of an
 * {@link java.io.InputStream InputStream}, which is wrapped in an
 * {@link java.io.InputStreamReader InputStreamReader} for decoding.
 *
 * @param inputStream
 *            Byte source to parse.
 * @param delimiter
 *            Character separating the columns.
 * @param charset
 *            {@link java.nio.charset.Charset Charset} used to decode the
 *            bytes.
 */
public CsvReader(InputStream inputStream, char delimiter, Charset charset) {
    this(new InputStreamReader(inputStream, charset), delimiter);
}
209
/***
 * Builds a comma-delimited {@link com.csvreader.CsvReader CsvReader} on top
 * of an {@link java.io.InputStream InputStream}, which is wrapped in an
 * {@link java.io.InputStreamReader InputStreamReader} for decoding.
 *
 * @param inputStream
 *            Byte source to parse.
 * @param charset
 *            {@link java.nio.charset.Charset Charset} used to decode the
 *            bytes.
 */
public CsvReader(InputStream inputStream, Charset charset) {
    this(new InputStreamReader(inputStream, charset));
}
224
225 public boolean getCaptureRawRecord() {
226 return userSettings.CaptureRawRecord;
227 }
228
229 public void setCaptureRawRecord(boolean captureRawRecord) {
230 userSettings.CaptureRawRecord = captureRawRecord;
231 }
232
233 public String getRawRecord() {
234 return rawRecord;
235 }
236
237 /***
238 * Gets whether leading and trailing whitespace characters are being trimmed
239 * from non-textqualified column data. Default is true.
240 *
241 * @return Whether leading and trailing whitespace characters are being
242 * trimmed from non-textqualified column data.
243 */
244 public boolean getTrimWhitespace() {
245 return userSettings.TrimWhitespace;
246 }
247
248 /***
249 * Sets whether leading and trailing whitespace characters should be trimmed
250 * from non-textqualified column data or not. Default is true.
251 *
252 * @param trimWhitespace
253 * Whether leading and trailing whitespace characters should
254 * be trimmed from non-textqualified column data or not.
255 */
256 public void setTrimWhitespace(boolean trimWhitespace) {
257 userSettings.TrimWhitespace = trimWhitespace;
258 }
259
260 /***
261 * Gets the character being used as the column delimiter. Default is comma,
262 * ','.
263 *
264 * @return The character being used as the column delimiter.
265 */
266 public char getDelimiter() {
267 return userSettings.Delimiter;
268 }
269
270 /***
271 * Sets the character to use as the column delimiter. Default is comma, ','.
272 *
273 * @param delimiter
274 * The character to use as the column delimiter.
275 */
276 public void setDelimiter(char delimiter) {
277 userSettings.Delimiter = delimiter;
278 }
279
280 public char getRecordDelimiter() {
281 return userSettings.RecordDelimiter;
282 }
283
284 /***
285 * Sets the character to use as the record delimiter.
286 *
287 * @param recordDelimiter
288 * The character to use as the record delimiter. Default is
289 * combination of standard end of line characters for
290 * Windows, Unix, or Mac.
291 */
292 public void setRecordDelimiter(char recordDelimiter) {
293 useCustomRecordDelimiter = true;
294 userSettings.RecordDelimiter = recordDelimiter;
295 }
296
297 /***
298 * Gets the character to use as a text qualifier in the data.
299 *
300 * @return The character to use as a text qualifier in the data.
301 */
302 public char getTextQualifier() {
303 return userSettings.TextQualifier;
304 }
305
306 /***
307 * Sets the character to use as a text qualifier in the data.
308 *
309 * @param textQualifier
310 * The character to use as a text qualifier in the data.
311 */
312 public void setTextQualifier(char textQualifier) {
313 userSettings.TextQualifier = textQualifier;
314 }
315
316 /***
317 * Whether text qualifiers will be used while parsing or not.
318 *
319 * @return Whether text qualifiers will be used while parsing or not.
320 */
321 public boolean getUseTextQualifier() {
322 return userSettings.UseTextQualifier;
323 }
324
325 /***
326 * Sets whether text qualifiers will be used while parsing or not.
327 *
328 * @param useTextQualifier
329 * Whether to use a text qualifier while parsing or not.
330 */
331 public void setUseTextQualifier(boolean useTextQualifier) {
332 userSettings.UseTextQualifier = useTextQualifier;
333 }
334
335 /***
336 * Gets the character being used as a comment signal.
337 *
338 * @return The character being used as a comment signal.
339 */
340 public char getComment() {
341 return userSettings.Comment;
342 }
343
344 /***
345 * Sets the character to use as a comment signal.
346 *
347 * @param comment
348 * The character to use as a comment signal.
349 */
350 public void setComment(char comment) {
351 userSettings.Comment = comment;
352 }
353
354 /***
355 * Gets whether comments are being looked for while parsing or not.
356 *
357 * @return Whether comments are being looked for while parsing or not.
358 */
359 public boolean getUseComments() {
360 return userSettings.UseComments;
361 }
362
363 /***
364 * Sets whether comments are being looked for while parsing or not.
365 *
366 * @param useComments
367 * Whether comments are being looked for while parsing or
368 * not.
369 */
370 public void setUseComments(boolean useComments) {
371 userSettings.UseComments = useComments;
372 }
373
374 /***
375 * Gets the current way to escape an occurance of the text qualifier inside
376 * qualified data.
377 *
378 * @return The current way to escape an occurance of the text qualifier
379 * inside qualified data.
380 */
381 public int getEscapeMode() {
382 return userSettings.EscapeMode;
383 }
384
385 /***
386 * Sets the current way to escape an occurance of the text qualifier inside
387 * qualified data.
388 *
389 * @param escapeMode
390 * The way to escape an occurance of the text qualifier
391 * inside qualified data.
392 * @exception IllegalArgumentException
393 * When an illegal value is specified for escapeMode.
394 */
395 public void setEscapeMode(int escapeMode) throws IllegalArgumentException {
396 if (escapeMode != ESCAPE_MODE_DOUBLED && escapeMode != ESCAPE_MODE_BACKSLASH) {
397 throw new IllegalArgumentException("Parameter escapeMode must be a valid value.");
398 }
399
400 userSettings.EscapeMode = escapeMode;
401 }
402
403 public boolean getSkipEmptyRecords() {
404 return userSettings.SkipEmptyRecords;
405 }
406
407 public void setSkipEmptyRecords(boolean skipEmptyRecords) {
408 userSettings.SkipEmptyRecords = skipEmptyRecords;
409 }
410
411 /***
412 * Safety caution to prevent the parser from using large amounts of memory
413 * in the case where parsing settings like file encodings don't end up
414 * matching the actual format of a file. This switch can be turned off if
415 * the file format is known and tested. With the switch off, the max column
416 * lengths and max column count per record supported by the parser will
417 * greatly increase. Default is true.
418 *
419 * @return The current setting of the safety switch.
420 */
421 public boolean getSafetySwitch() {
422 return userSettings.SafetySwitch;
423 }
424
425 /***
426 * Safety caution to prevent the parser from using large amounts of memory
427 * in the case where parsing settings like file encodings don't end up
428 * matching the actual format of a file. This switch can be turned off if
429 * the file format is known and tested. With the switch off, the max column
430 * lengths and max column count per record supported by the parser will
431 * greatly increase. Default is true.
432 *
433 * @param safetySwitch
434 */
435 public void setSafetySwitch(boolean safetySwitch) {
436 userSettings.SafetySwitch = safetySwitch;
437 }
438
439 /***
440 * Gets the count of columns found in this record.
441 *
442 * @return The count of columns found in this record.
443 */
444 public int getColumnCount() {
445 return columnsCount;
446 }
447
448 /***
449 * Gets the index of the current record.
450 *
451 * @return The index of the current record.
452 */
453 public long getCurrentRecord() {
454 return currentRecord - 1;
455 }
456
457 /***
458 * Gets the count of headers read in by a previous call to
459 * {@link com.csvreader.CsvReader#readHeaders readHeaders()}.
460 *
461 * @return The count of headers read in by a previous call to
462 * {@link com.csvreader.CsvReader#readHeaders readHeaders()}.
463 */
464 public int getHeaderCount() {
465 return headersHolder.Length;
466 }
467
468 /***
469 * Returns the header values as a string array.
470 *
471 * @return The header values as a String array.
472 * @exception IOException
473 * Thrown if this object has already been closed.
474 */
475 public String[] getHeaders() throws IOException {
476 checkClosed();
477
478 if (headersHolder.Headers == null) {
479 return null;
480 } else {
481
482
483
484 String[] clone = new String[headersHolder.Length];
485 System.arraycopy(headersHolder.Headers, 0, clone, 0, headersHolder.Length);
486 return clone;
487 }
488 }
489
490 @SuppressWarnings("unchecked")
491 public void setHeaders(String[] headers) {
492 headersHolder.Headers = headers;
493
494 headersHolder.IndexByName.clear();
495
496 if (headers != null) {
497 headersHolder.Length = headers.length;
498 } else {
499 headersHolder.Length = 0;
500 }
501
502
503 for (int i = 0; i < headersHolder.Length; i++) {
504 headersHolder.IndexByName.put(headers[i], new Integer(i));
505 }
506 }
507
508 public String[] getValues() throws IOException {
509 checkClosed();
510
511
512
513 String[] clone = new String[columnsCount];
514 System.arraycopy(values, 0, clone, 0, columnsCount);
515 return clone;
516 }
517
518 /***
519 * Returns the current column value for a given column index.
520 *
521 * @param columnIndex
522 * The index of the column.
523 * @return The current column value.
524 * @exception IOException
525 * Thrown if this object has already been closed.
526 */
527 public String get(int columnIndex) throws IOException {
528 checkClosed();
529
530 if (columnIndex > -1 && columnIndex < columnsCount) {
531 return values[columnIndex];
532 } else {
533 return "";
534 }
535 }
536
537 /***
538 * Returns the current column value for a given column header name.
539 *
540 * @param headerName
541 * The header name of the column.
542 * @return The current column value.
543 * @exception IOException
544 * Thrown if this object has already been closed.
545 */
546 public String get(String headerName) throws IOException {
547 checkClosed();
548
549 return get(getIndex(headerName));
550 }
551
552 /***
553 * Creates a {@link com.csvreader.CsvReader CsvReader} object using a string
554 * of data as the source. Uses ISO-8859-1 as the
555 * {@link java.nio.charset.Charset Charset}.
556 *
557 * @param data
558 * The String of data to use as the source.
559 * @return A {@link com.csvreader.CsvReader CsvReader} object using the
560 * String of data as the source.
561 */
562 public static CsvReader parse(String data) {
563 if (data == null) {
564 throw new IllegalArgumentException("Parameter data can not be null.");
565 }
566
567 return new CsvReader(new StringReader(data));
568 }
569
570 /***
571 * Reads another record: scans the buffered input one character at a time,
572 * splitting it into columns until a record ends or the data runs out.
573 * @return Whether another record was successfully read or not.
574 * @exception IOException
575 * Thrown if an error occurs while reading data from the
576 * source stream, or if a safety-switch limit is exceeded.
577 */
578 public boolean readRecord() throws IOException {
579 checkClosed();
580 // reset the per-record state
581 columnsCount = 0;
582 rawBuffer.Position = 0;
583 // the raw text of this record starts at the current read position
584 dataBuffer.LineStart = dataBuffer.Position;
585
586 hasReadNextLine = false;
587
588 // hasMoreData is cleared by checkDataLength() once the source is exhausted,
589 // in which case no parsing is attempted at all
590 if (hasMoreData) {
591 // loop until a record has been completed or the data runs out
592 // NOTE(review): hasReadNextLine is never set to true in this method;
593 // presumably endRecord() (defined elsewhere) does that - confirm
594 do {
595 if (dataBuffer.Position == dataBuffer.Count) {
 // every buffered character has been consumed: refill the buffer
596 checkDataLength();
597 } else {
598 startedWithQualifier = false;
599
600 // look at the next unread character; it decides which kind of
601 // column (or delimiter, comment, whitespace) starts here
602 char currentLetter = dataBuffer.Buffer[dataBuffer.Position];
603
604 if (userSettings.UseTextQualifier && currentLetter == userSettings.TextQualifier) {
605 // the column opens with the text qualifier, so it is parsed by
606 // the qualified-column loop below, where delimiters inside the
607 // qualifiers are treated as data
608
609
610 lastLetter = currentLetter;
611
612 // column content starts right after the opening qualifier
613 startedColumn = true;
614 dataBuffer.ColumnStart = dataBuffer.Position + 1;
615 startedWithQualifier = true;
616 boolean lastLetterWasQualifier = false;
617 // in doubled mode the qualifier escapes itself ...
618 char escapeChar = userSettings.TextQualifier;
619 // ... in backslash mode the backslash does
620 if (userSettings.EscapeMode == ESCAPE_MODE_BACKSLASH) {
621 escapeChar = Letters.BACKSLASH;
622 }
623
624 boolean eatingTrailingJunk = false;
625 boolean lastLetterWasEscape = false;
626 boolean readingComplexEscape = false;
627 int escape = ComplexEscape.UNICODE;
628 int escapeLength = 0;
629 char escapeValue = (char) 0;
630 // step past the opening qualifier
631 dataBuffer.Position++;
632
633 do {
634 if (dataBuffer.Position == dataBuffer.Count) {
635 checkDataLength();
636 } else {
637 // fetch the next character of the qualified column
638
639 currentLetter = dataBuffer.Buffer[dataBuffer.Position];
640
 // characters between the closing qualifier and the next
 // delimiter are discarded by moving ColumnStart past them
641 if (eatingTrailingJunk) {
642 dataBuffer.ColumnStart = dataBuffer.Position + 1;
643
644 if (currentLetter == userSettings.Delimiter) {
645 endColumn();
646 } else if ((!useCustomRecordDelimiter && (currentLetter == Letters.CR || currentLetter == Letters.LF))
647 || (useCustomRecordDelimiter && currentLetter == userSettings.RecordDelimiter)) {
648 endColumn();
649
650 endRecord();
651 }
 // inside a numeric escape: fold this digit into escapeValue
 // using the base selected when the escape started
652 } else if (readingComplexEscape) {
653 escapeLength++;
654
655 switch (escape) {
656 case ComplexEscape.UNICODE:
657 escapeValue *= (char) 16;
658 escapeValue += hexToDec(currentLetter);
659
660 if (escapeLength == 4) {
661 readingComplexEscape = false;
662 }
663
664 break;
665 case ComplexEscape.OCTAL:
666 escapeValue *= (char) 8;
667 escapeValue += (char) (currentLetter - '0');
668
669 if (escapeLength == 3) {
670 readingComplexEscape = false;
671 }
672
673 break;
674 case ComplexEscape.DECIMAL:
675 escapeValue *= (char) 10;
676 escapeValue += (char) (currentLetter - '0');
677
678 if (escapeLength == 3) {
679 readingComplexEscape = false;
680 }
681
682 break;
683 case ComplexEscape.HEX:
684 escapeValue *= (char) 16;
685 escapeValue += hexToDec(currentLetter);
686
687 if (escapeLength == 2) {
688 readingComplexEscape = false;
689 }
690
691 break;
692 }
693 // all digits read: emit the character; otherwise keep skipping the raw digits
694 if (!readingComplexEscape) {
695 appendLetter(escapeValue);
696 } else {
697 dataBuffer.ColumnStart = dataBuffer.Position + 1;
698 }
699 } else if (currentLetter == userSettings.TextQualifier) {
 // a qualifier following an escape is literal data; otherwise it
 // may close the column, which the next character decides
700 if (lastLetterWasEscape) {
701 lastLetterWasEscape = false;
702 lastLetterWasQualifier = false;
703 } else {
704 updateCurrentValue();
705
706 if (userSettings.EscapeMode == ESCAPE_MODE_DOUBLED) {
707 lastLetterWasEscape = true;
708 }
709
710 lastLetterWasQualifier = true;
711 }
712 } else if (userSettings.EscapeMode == ESCAPE_MODE_BACKSLASH && lastLetterWasEscape) {
 // character following a backslash: translate the escape sequence
713 switch (currentLetter) {
714 case 'n':
715 appendLetter(Letters.LF);
716 break;
717 case 'r':
718 appendLetter(Letters.CR);
719 break;
720 case 't':
721 appendLetter(Letters.TAB);
722 break;
723 case 'b':
724 appendLetter(Letters.BACKSPACE);
725 break;
726 case 'f':
727 appendLetter(Letters.FORM_FEED);
728 break;
729 case 'e':
730 appendLetter(Letters.ESCAPE);
731 break;
732 case 'v':
733 appendLetter(Letters.VERTICAL_TAB);
734 break;
735 case 'a':
736 appendLetter(Letters.ALERT);
737 break;
 // a bare octal digit starts an octal escape and already
 // counts as its first digit
738 case '0':
739 case '1':
740 case '2':
741 case '3':
742 case '4':
743 case '5':
744 case '6':
745 case '7':
746 escape = ComplexEscape.OCTAL;
747 readingComplexEscape = true;
748 escapeLength = 1;
749 escapeValue = (char) (currentLetter - '0');
750 dataBuffer.ColumnStart = dataBuffer.Position + 1;
751 break;
 // a letter prefix selects the numeric base; the digits follow
752 case 'u':
753 case 'x':
754 case 'o':
755 case 'd':
756 case 'U':
757 case 'X':
758 case 'O':
759 case 'D':
760 switch (currentLetter) {
761 case 'u':
762 case 'U':
763 escape = ComplexEscape.UNICODE;
764 break;
765 case 'x':
766 case 'X':
767 escape = ComplexEscape.HEX;
768 break;
769 case 'o':
770 case 'O':
771 escape = ComplexEscape.OCTAL;
772 break;
773 case 'd':
774 case 'D':
775 escape = ComplexEscape.DECIMAL;
776 break;
777 }
778
779 readingComplexEscape = true;
780 escapeLength = 0;
781 escapeValue = (char) 0;
782 dataBuffer.ColumnStart = dataBuffer.Position + 1;
783
784 break;
785 default:
786 break;
787 }
788
789 lastLetterWasEscape = false;
790
791 // reached only in backslash mode: in doubled mode escapeChar is the qualifier
792 } else if (currentLetter == escapeChar) {
793 updateCurrentValue();
794 lastLetterWasEscape = true;
795 } else {
 // the previous character was an unescaped qualifier, so it
 // closed the column; decide what follows it
796 if (lastLetterWasQualifier) {
797 if (currentLetter == userSettings.Delimiter) {
798 endColumn();
799 } else if ((!useCustomRecordDelimiter && (currentLetter == Letters.CR || currentLetter == Letters.LF))
800 || (useCustomRecordDelimiter && currentLetter == userSettings.RecordDelimiter)) {
801 endColumn();
802
803 endRecord();
804 } else {
805 dataBuffer.ColumnStart = dataBuffer.Position + 1;
806
807 eatingTrailingJunk = true;
808 }
809
810 // clear the flag so the next character is not treated
811 // as following a qualifier as well
812
813 lastLetterWasQualifier = false;
814 }
815 }
816
817 // remember this character for the decisions made on the
818 // following iterations
819
820 lastLetter = currentLetter;
821 // endColumn() clears startedColumn, so the position only advances here while the column is still open
822 if (startedColumn) {
823 dataBuffer.Position++;
824 // abort instead of buffering an unbounded column while the safety switch is on
825 if (userSettings.SafetySwitch
826 && dataBuffer.Position - dataBuffer.ColumnStart + columnBuffer.Position > 100000) {
827 close();
828
829 throw new IOException(
830 "Maximum column length of 100,000 exceeded in column "
831 + NumberFormat.getIntegerInstance().format(columnsCount)
832 + " in record "
833 + NumberFormat.getIntegerInstance().format(currentRecord)
834 + ". Set the SafetySwitch property to false"
835 + " if you're expecting column lengths greater than 100,000 characters to"
836 + " avoid this error.");
837 }
838 }
839 }
840
841 } while (hasMoreData && startedColumn);
842 } else if (currentLetter == userSettings.Delimiter) {
843 // a delimiter where a column would start: the column is empty,
844 // so just close it
845
846 lastLetter = currentLetter;
847
848 endColumn();
849 } else if (useCustomRecordDelimiter && currentLetter == userSettings.RecordDelimiter) {
850 // a record with no columns is reported only when empty records are not skipped
851 if (startedColumn || columnsCount > 0 || !userSettings.SkipEmptyRecords) {
852 endColumn();
853
854 endRecord();
855 } else {
856 dataBuffer.LineStart = dataBuffer.Position + 1;
857 }
858
859 lastLetter = currentLetter;
860 } else if (!useCustomRecordDelimiter
861 && (currentLetter == Letters.CR || currentLetter == Letters.LF)) {
862 // as above; additionally an LF directly after a CR does not produce a second empty record
863 if (startedColumn
864 || columnsCount > 0
865 || (!userSettings.SkipEmptyRecords && (currentLetter == Letters.CR || lastLetter != Letters.CR))) {
866 endColumn();
867
868 endRecord();
869 } else {
870 dataBuffer.LineStart = dataBuffer.Position + 1;
871 }
872
873 lastLetter = currentLetter;
874 } else if (userSettings.UseComments && columnsCount == 0 && currentLetter == userSettings.Comment) {
875 // comment character before any column of this record: hand
876 // the rest of the line to skipLine()
877
878 lastLetter = currentLetter;
879
880 skipLine();
881 } else if (userSettings.TrimWhitespace
882 && (currentLetter == Letters.SPACE || currentLetter == Letters.TAB)) {
883 // leading whitespace: open the column but move its start past
884 // the whitespace character
885
886 startedColumn = true;
887 dataBuffer.ColumnStart = dataBuffer.Position + 1;
888 } else {
889 // any other character is the first character of an
890 // unqualified column
891
892 startedColumn = true;
893 dataBuffer.ColumnStart = dataBuffer.Position;
894 boolean lastLetterWasBackslash = false;
895 boolean readingComplexEscape = false;
896 int escape = ComplexEscape.UNICODE;
897 int escapeLength = 0;
898 char escapeValue = (char) 0;
899 // on the first pass currentLetter is already loaded, so it is neither re-read nor preceded by a refill
900 boolean firstLoop = true;
901
902 do {
903 if (!firstLoop && dataBuffer.Position == dataBuffer.Count) {
904 checkDataLength();
905 } else {
906 if (!firstLoop) {
907 // fetch the next character of the unqualified column
908 currentLetter = dataBuffer.Buffer[dataBuffer.Position];
909 }
910
 // backslash escapes in unqualified data apply only when
 // text qualifiers are switched off
911 if (!userSettings.UseTextQualifier && userSettings.EscapeMode == ESCAPE_MODE_BACKSLASH
912 && currentLetter == Letters.BACKSLASH) {
913 if (lastLetterWasBackslash) {
914 lastLetterWasBackslash = false;
915 } else {
916 updateCurrentValue();
917 lastLetterWasBackslash = true;
918 }
 // same numeric-escape accumulation as in the qualified loop
919 } else if (readingComplexEscape) {
920 escapeLength++;
921
922 switch (escape) {
923 case ComplexEscape.UNICODE:
924 escapeValue *= (char) 16;
925 escapeValue += hexToDec(currentLetter);
926
927 if (escapeLength == 4) {
928 readingComplexEscape = false;
929 }
930
931 break;
932 case ComplexEscape.OCTAL:
933 escapeValue *= (char) 8;
934 escapeValue += (char) (currentLetter - '0');
935
936 if (escapeLength == 3) {
937 readingComplexEscape = false;
938 }
939
940 break;
941 case ComplexEscape.DECIMAL:
942 escapeValue *= (char) 10;
943 escapeValue += (char) (currentLetter - '0');
944
945 if (escapeLength == 3) {
946 readingComplexEscape = false;
947 }
948
949 break;
950 case ComplexEscape.HEX:
951 escapeValue *= (char) 16;
952 escapeValue += hexToDec(currentLetter);
953
954 if (escapeLength == 2) {
955 readingComplexEscape = false;
956 }
957
958 break;
959 }
960
961 if (!readingComplexEscape) {
962 appendLetter(escapeValue);
963 } else {
964 dataBuffer.ColumnStart = dataBuffer.Position + 1;
965 }
 // same escape-sequence translation as in the qualified loop
966 } else if (userSettings.EscapeMode == ESCAPE_MODE_BACKSLASH && lastLetterWasBackslash) {
967 switch (currentLetter) {
968 case 'n':
969 appendLetter(Letters.LF);
970 break;
971 case 'r':
972 appendLetter(Letters.CR);
973 break;
974 case 't':
975 appendLetter(Letters.TAB);
976 break;
977 case 'b':
978 appendLetter(Letters.BACKSPACE);
979 break;
980 case 'f':
981 appendLetter(Letters.FORM_FEED);
982 break;
983 case 'e':
984 appendLetter(Letters.ESCAPE);
985 break;
986 case 'v':
987 appendLetter(Letters.VERTICAL_TAB);
988 break;
989 case 'a':
990 appendLetter(Letters.ALERT);
991 break;
992 case '0':
993 case '1':
994 case '2':
995 case '3':
996 case '4':
997 case '5':
998 case '6':
999 case '7':
1000 escape = ComplexEscape.OCTAL;
1001 readingComplexEscape = true;
1002 escapeLength = 1;
1003 escapeValue = (char) (currentLetter - '0');
1004 dataBuffer.ColumnStart = dataBuffer.Position + 1;
1005 break;
1006 case 'u':
1007 case 'x':
1008 case 'o':
1009 case 'd':
1010 case 'U':
1011 case 'X':
1012 case 'O':
1013 case 'D':
1014 switch (currentLetter) {
1015 case 'u':
1016 case 'U':
1017 escape = ComplexEscape.UNICODE;
1018 break;
1019 case 'x':
1020 case 'X':
1021 escape = ComplexEscape.HEX;
1022 break;
1023 case 'o':
1024 case 'O':
1025 escape = ComplexEscape.OCTAL;
1026 break;
1027 case 'd':
1028 case 'D':
1029 escape = ComplexEscape.DECIMAL;
1030 break;
1031 }
1032
1033 readingComplexEscape = true;
1034 escapeLength = 0;
1035 escapeValue = (char) 0;
1036 dataBuffer.ColumnStart = dataBuffer.Position + 1;
1037
1038 break;
1039 default:
1040 break;
1041 }
1042
1043 lastLetterWasBackslash = false;
1044 } else {
 // plain character: only delimiters need action, ordinary data
 // stays in the buffer until the column is closed
1045 if (currentLetter == userSettings.Delimiter) {
1046 endColumn();
1047 } else if ((!useCustomRecordDelimiter && (currentLetter == Letters.CR || currentLetter == Letters.LF))
1048 || (useCustomRecordDelimiter && currentLetter == userSettings.RecordDelimiter)) {
1049 endColumn();
1050
1051 endRecord();
1052 }
1053 }
1054
1055 // remember this character for the decisions made on the
1056 // following iterations
1057
1058 lastLetter = currentLetter;
1059 firstLoop = false;
1060
1061 if (startedColumn) {
1062 dataBuffer.Position++;
1063
1064 if (userSettings.SafetySwitch
1065 && dataBuffer.Position - dataBuffer.ColumnStart + columnBuffer.Position > 100000) {
1066 close();
1067
1068 throw new IOException(
1069 "Maximum column length of 100,000 exceeded in column "
1070 + NumberFormat.getIntegerInstance().format(columnsCount)
1071 + " in record "
1072 + NumberFormat.getIntegerInstance().format(currentRecord)
1073 + ". Set the SafetySwitch property to false"
1074 + " if you're expecting column lengths greater than 100,000 characters to"
1075 + " avoid this error.");
1076 }
1077 }
1078 }
1079 } while (hasMoreData && startedColumn);
1080 }
1081 // step past the character that was just handled (the inner loops stop on it without advancing)
1082 if (hasMoreData) {
1083 dataBuffer.Position++;
1084 }
1085 }
1086 } while (hasMoreData && !hasReadNextLine);
1087
1088 // the loop may also end because the data ran out: if a column is still
1089 // open, or the last character was a delimiter, flush the final column
1090 // and close the record
1091 if (startedColumn || lastLetter == userSettings.Delimiter) {
1092 endColumn();
1093
1094 endRecord();
1095 }
1096 }
1097 // publish the raw text of the record when capture is enabled
1098 if (userSettings.CaptureRawRecord) {
1099 if (hasMoreData) {
 // the - 1 leaves out the last consumed character, i.e. the one
 // that ended the record
1100 if (rawBuffer.Position == 0) {
1101 rawRecord = new String(dataBuffer.Buffer, dataBuffer.LineStart, dataBuffer.Position
1102 - dataBuffer.LineStart - 1);
1103 } else {
 // the record spans a buffer refill: earlier parts are in rawBuffer
1104 rawRecord = new String(rawBuffer.Buffer, 0, rawBuffer.Position)
1105 + new String(dataBuffer.Buffer, dataBuffer.LineStart, dataBuffer.Position
1106 - dataBuffer.LineStart - 1);
1107 }
1108 } else {
1109 // at end of data only rawBuffer is used; checkDataLength() copies
1110 // the pending characters into it before each read
1111
1112 rawRecord = new String(rawBuffer.Buffer, 0, rawBuffer.Position);
1113 }
1114 } else {
1115 rawRecord = "";
1116 }
1117
1118 return hasReadNextLine;
1119 }
1120
1121 /***
1122 * @exception IOException
1123 * Thrown if an error occurs while reading data from the
1124 * source stream.
1125 */
1126 private void checkDataLength() throws IOException {
1127 if (!initialized) {
1128 if (fileName != null) {
1129 inputStream = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), charset),
1130 StaticSettings.MAX_FILE_BUFFER_SIZE);
1131 }
1132
1133 charset = null;
1134 initialized = true;
1135 }
1136
1137 updateCurrentValue();
1138
1139 if (userSettings.CaptureRawRecord && dataBuffer.Count > 0) {
1140 if (rawBuffer.Buffer.length - rawBuffer.Position < dataBuffer.Count - dataBuffer.LineStart) {
1141 int newLength = rawBuffer.Buffer.length
1142 + Math.max(dataBuffer.Count - dataBuffer.LineStart, rawBuffer.Buffer.length);
1143
1144 char[] holder = new char[newLength];
1145
1146 System.arraycopy(rawBuffer.Buffer, 0, holder, 0, rawBuffer.Position);
1147
1148 rawBuffer.Buffer = holder;
1149 }
1150
1151 System.arraycopy(dataBuffer.Buffer, dataBuffer.LineStart, rawBuffer.Buffer, rawBuffer.Position,
1152 dataBuffer.Count - dataBuffer.LineStart);
1153
1154 rawBuffer.Position += dataBuffer.Count - dataBuffer.LineStart;
1155 }
1156
1157 try {
1158 dataBuffer.Count = inputStream.read(dataBuffer.Buffer, 0, dataBuffer.Buffer.length);
1159 } catch (IOException ex) {
1160 close();
1161
1162 throw ex;
1163 }
1164
1165
1166
1167
1168 if (dataBuffer.Count == -1) {
1169 hasMoreData = false;
1170 }
1171
1172 dataBuffer.Position = 0;
1173 dataBuffer.LineStart = 0;
1174 dataBuffer.ColumnStart = 0;
1175 }
1176
1177 /***
1178 * Read the first record of data as column headers.
1179 *
1180 * @return Whether the header record was successfully read or not.
1181 * @exception IOException
1182 * Thrown if an error occurs while reading data from the
1183 * source stream.
1184 */
1185 @SuppressWarnings("unchecked")
1186 public boolean readHeaders() throws IOException {
1187 boolean result = readRecord();
1188
1189
1190
1191
1192 headersHolder.Length = columnsCount;
1193
1194 headersHolder.Headers = new String[columnsCount];
1195
1196 for (int i = 0; i < headersHolder.Length; i++) {
1197 String columnValue = get(i);
1198
1199 headersHolder.Headers[i] = columnValue;
1200
1201
1202 headersHolder.IndexByName.put(columnValue, new Integer(i));
1203 }
1204
1205 if (result) {
1206 currentRecord--;
1207 }
1208
1209 columnsCount = 0;
1210
1211 return result;
1212 }
1213
1214 /***
1215 * Returns the column header value for a given column index.
1216 *
1217 * @param columnIndex
1218 * The index of the header column being requested.
1219 * @return The value of the column header at the given column index.
1220 * @exception IOException
1221 * Thrown if this object has already been closed.
1222 */
1223 public String getHeader(int columnIndex) throws IOException {
1224 checkClosed();
1225
1226
1227
1228
1229
1230
1231 if (columnIndex > -1 && columnIndex < headersHolder.Length) {
1232
1233
1234 return headersHolder.Headers[columnIndex];
1235 } else {
1236 return "";
1237 }
1238 }
1239
1240 public boolean isQualified(int columnIndex) throws IOException {
1241 checkClosed();
1242
1243 if (columnIndex < columnsCount && columnIndex > -1) {
1244 return isQualified[columnIndex];
1245 } else {
1246 return false;
1247 }
1248 }
1249
1250 /***
1251 * @exception IOException
1252 * Thrown if a very rare extreme exception occurs during
1253 * parsing, normally resulting from improper data format.
1254 */
1255 private void endColumn() throws IOException {
1256 String currentValue = "";
1257
1258
1259 if (startedColumn) {
1260 if (columnBuffer.Position == 0) {
1261 if (dataBuffer.ColumnStart < dataBuffer.Position) {
1262 int lastLetter = dataBuffer.Position - 1;
1263
1264 if (userSettings.TrimWhitespace && !startedWithQualifier) {
1265 while (lastLetter >= dataBuffer.ColumnStart
1266 && (dataBuffer.Buffer[lastLetter] == Letters.SPACE || dataBuffer.Buffer[lastLetter] == Letters.TAB)) {
1267 lastLetter--;
1268 }
1269 }
1270
1271 currentValue = new String(dataBuffer.Buffer, dataBuffer.ColumnStart, lastLetter
1272 - dataBuffer.ColumnStart + 1);
1273 }
1274 } else {
1275 updateCurrentValue();
1276
1277 int lastLetter = columnBuffer.Position - 1;
1278
1279 if (userSettings.TrimWhitespace && !startedWithQualifier) {
1280 while (lastLetter >= 0
1281 && (columnBuffer.Buffer[lastLetter] == Letters.SPACE || columnBuffer.Buffer[lastLetter] == Letters.SPACE)) {
1282 lastLetter--;
1283 }
1284 }
1285
1286 currentValue = new String(columnBuffer.Buffer, 0, lastLetter + 1);
1287 }
1288 }
1289
1290 columnBuffer.Position = 0;
1291
1292 startedColumn = false;
1293
1294 if (columnsCount >= 100000 && userSettings.SafetySwitch) {
1295 close();
1296
1297 throw new IOException("Maximum column count of 100,000 exceeded in record "
1298 + NumberFormat.getIntegerInstance().format(currentRecord)
1299 + ". Set the SafetySwitch property to false"
1300 + " if you're expecting more than 100,000 columns per record to" + " avoid this error.");
1301 }
1302
1303
1304
1305
1306
1307 if (columnsCount == values.length) {
1308
1309 int newLength = values.length * 2;
1310
1311 String[] holder = new String[newLength];
1312
1313 System.arraycopy(values, 0, holder, 0, values.length);
1314
1315 values = holder;
1316
1317 boolean[] qualifiedHolder = new boolean[newLength];
1318
1319 System.arraycopy(isQualified, 0, qualifiedHolder, 0, isQualified.length);
1320
1321 isQualified = qualifiedHolder;
1322 }
1323
1324
1325
1326 if (!startedWithQualifier && currentValue.equals("")) {
1327 currentValue = null;
1328 }
1329
1330
1331 values[columnsCount] = currentValue;
1332
1333 isQualified[columnsCount] = startedWithQualifier;
1334
1335 currentValue = "";
1336
1337 columnsCount++;
1338 }
1339
1340 private void appendLetter(char letter) {
1341 if (columnBuffer.Position == columnBuffer.Buffer.length) {
1342 int newLength = columnBuffer.Buffer.length * 2;
1343
1344 char[] holder = new char[newLength];
1345
1346 System.arraycopy(columnBuffer.Buffer, 0, holder, 0, columnBuffer.Position);
1347
1348 columnBuffer.Buffer = holder;
1349 }
1350 columnBuffer.Buffer[columnBuffer.Position++] = letter;
1351 dataBuffer.ColumnStart = dataBuffer.Position + 1;
1352 }
1353
1354 private void updateCurrentValue() {
1355 if (startedColumn && dataBuffer.ColumnStart < dataBuffer.Position) {
1356 if (columnBuffer.Buffer.length - columnBuffer.Position < dataBuffer.Position - dataBuffer.ColumnStart) {
1357 int newLength = columnBuffer.Buffer.length
1358 + Math.max(dataBuffer.Position - dataBuffer.ColumnStart, columnBuffer.Buffer.length);
1359
1360 char[] holder = new char[newLength];
1361
1362 System.arraycopy(columnBuffer.Buffer, 0, holder, 0, columnBuffer.Position);
1363
1364 columnBuffer.Buffer = holder;
1365 }
1366
1367 System.arraycopy(dataBuffer.Buffer, dataBuffer.ColumnStart, columnBuffer.Buffer, columnBuffer.Position,
1368 dataBuffer.Position - dataBuffer.ColumnStart);
1369
1370 columnBuffer.Position += dataBuffer.Position - dataBuffer.ColumnStart;
1371 }
1372
1373 dataBuffer.ColumnStart = dataBuffer.Position + 1;
1374 }
1375
1376 /***
1377 * @exception IOException
1378 * Thrown if an error occurs while reading data from the
1379 * source stream.
1380 */
1381 private void endRecord() throws IOException {
1382
1383
1384
1385 hasReadNextLine = true;
1386
1387 currentRecord++;
1388 }
1389
1390 /***
1391 * Gets the corresponding column index for a given column header name.
1392 *
1393 * @param headerName
1394 * The header name of the column.
1395 * @return The column index for the given column header name. Returns
1396 * -1 if not found.
1397 * @exception IOException
1398 * Thrown if this object has already been closed.
1399 */
1400 public int getIndex(String headerName) throws IOException {
1401 checkClosed();
1402
1403 Object indexValue = headersHolder.IndexByName.get(headerName);
1404
1405 if (indexValue != null) {
1406 return ((Integer) indexValue).intValue();
1407 } else {
1408 return -1;
1409 }
1410 }
1411
1412 /***
1413 * Skips the next record of data by parsing each column. Does not
1414 * increment
1415 * {@link com.csvreader.CsvReader#getCurrentRecord getCurrentRecord()}.
1416 *
1417 * @return Whether another record was successfully skipped or not.
1418 * @exception IOException
1419 * Thrown if an error occurs while reading data from the
1420 * source stream.
1421 */
1422 public boolean skipRecord() throws IOException {
1423 checkClosed();
1424
1425 boolean recordRead = false;
1426
1427 if (hasMoreData) {
1428 recordRead = readRecord();
1429
1430 if (recordRead) {
1431 currentRecord--;
1432 }
1433 }
1434
1435 return recordRead;
1436 }
1437
1438 /***
1439 * Skips the next line of data using the standard end of line characters and
1440 * does not do any column delimited parsing.
1441 *
1442 * @return Whether a line was successfully skipped or not.
1443 * @exception IOException
1444 * Thrown if an error occurs while reading data from the
1445 * source stream.
1446 */
1447 public boolean skipLine() throws IOException {
1448 checkClosed();
1449
1450
1451
1452 columnsCount = 0;
1453
1454 boolean skippedLine = false;
1455
1456 if (hasMoreData) {
1457 boolean foundEol = false;
1458
1459 do {
1460 if (dataBuffer.Position == dataBuffer.Count) {
1461 checkDataLength();
1462 } else {
1463 skippedLine = true;
1464
1465
1466
1467 char currentLetter = dataBuffer.Buffer[dataBuffer.Position];
1468
1469 if (currentLetter == Letters.CR || currentLetter == Letters.LF) {
1470 foundEol = true;
1471 }
1472
1473
1474
1475
1476 lastLetter = currentLetter;
1477
1478 if (!foundEol) {
1479 dataBuffer.Position++;
1480 }
1481
1482 }
1483 } while (hasMoreData && !foundEol);
1484
1485 columnBuffer.Position = 0;
1486
1487 dataBuffer.LineStart = dataBuffer.Position + 1;
1488 }
1489
1490 rawBuffer.Position = 0;
1491 rawRecord = "";
1492
1493 return skippedLine;
1494 }
1495
1496 /***
1497 * Closes and releases all related resources.
1498 */
1499 public void close() {
1500 if (!closed) {
1501 close(true);
1502
1503 closed = true;
1504 }
1505 }
1506
1507 /***
1508 *
1509 */
1510 private void close(boolean closing) {
1511 if (!closed) {
1512 if (closing) {
1513 charset = null;
1514 headersHolder.Headers = null;
1515 headersHolder.IndexByName = null;
1516 dataBuffer.Buffer = null;
1517 columnBuffer.Buffer = null;
1518 rawBuffer.Buffer = null;
1519 }
1520
1521 try {
1522 if (initialized) {
1523 inputStream.close();
1524 }
1525 } catch (Exception e) {
1526
1527 }
1528
1529 inputStream = null;
1530
1531 closed = true;
1532 }
1533 }
1534
1535 /***
1536 * @exception IOException
1537 * Thrown if this object has already been closed.
1538 */
1539 private void checkClosed() throws IOException {
1540 if (closed) {
1541 throw new IOException("This instance of the CsvReader class has already been closed.");
1542 }
1543 }
1544
1545 /***
1546 *
1547 */
1548 protected void finalize() {
1549 close(false);
1550 }
1551
1552 private class ComplexEscape {
1553 private static final int UNICODE = 1;
1554
1555 private static final int OCTAL = 2;
1556
1557 private static final int DECIMAL = 3;
1558
1559 private static final int HEX = 4;
1560 }
1561
1562 private static char hexToDec(char hex) {
1563 char result;
1564
1565 if (hex >= 'a') {
1566 result = (char) (hex - 'a' + 10);
1567 } else if (hex >= 'A') {
1568 result = (char) (hex - 'A' + 10);
1569 } else {
1570 result = (char) (hex - '0');
1571 }
1572
1573 return result;
1574 }
1575
1576 private class DataBuffer {
1577 public char[] Buffer;
1578
1579 public int Position;
1580
1581
1582
1583
1584
1585 public int Count;
1586
1587
1588
1589
1590
1591
1592 public int ColumnStart;
1593
1594 public int LineStart;
1595
1596 public DataBuffer() {
1597 Buffer = new char[StaticSettings.MAX_BUFFER_SIZE];
1598 Position = 0;
1599 Count = 0;
1600 ColumnStart = 0;
1601 LineStart = 0;
1602 }
1603 }
1604
1605 private class ColumnBuffer {
1606 public char[] Buffer;
1607
1608 public int Position;
1609
1610 public ColumnBuffer() {
1611 Buffer = new char[StaticSettings.INITIAL_COLUMN_BUFFER_SIZE];
1612 Position = 0;
1613 }
1614 }
1615
1616 private class RawRecordBuffer {
1617 public char[] Buffer;
1618
1619 public int Position;
1620
1621 public RawRecordBuffer() {
1622 Buffer = new char[StaticSettings.INITIAL_COLUMN_BUFFER_SIZE * StaticSettings.INITIAL_COLUMN_COUNT];
1623 Position = 0;
1624 }
1625 }
1626
1627 private class Letters {
1628 public static final char LF = '\n';
1629
1630 public static final char CR = '\r';
1631
1632 public static final char QUOTE = '"';
1633
1634 public static final char COMMA = ',';
1635
1636 public static final char SPACE = ' ';
1637
1638 public static final char TAB = '\t';
1639
1640 public static final char POUND = '#';
1641
1642 public static final char BACKSLASH = '//';
1643
1644 public static final char NULL = '\0';
1645
1646 public static final char BACKSPACE = '\b';
1647
1648 public static final char FORM_FEED = '\f';
1649
1650 public static final char ESCAPE = '\u001B';
1651
1652 public static final char VERTICAL_TAB = '\u000B';
1653
1654 public static final char ALERT = '\u0007';
1655 }
1656
1657 private class UserSettings {
1658
1659
1660 public boolean CaseSensitive;
1661
1662 public char TextQualifier;
1663
1664 public boolean TrimWhitespace;
1665
1666 public boolean UseTextQualifier;
1667
1668 public char Delimiter;
1669
1670 public char RecordDelimiter;
1671
1672 public char Comment;
1673
1674 public boolean UseComments;
1675
1676 public int EscapeMode;
1677
1678 public boolean SafetySwitch;
1679
1680 public boolean SkipEmptyRecords;
1681
1682 public boolean CaptureRawRecord;
1683
1684 public UserSettings() {
1685 CaseSensitive = true;
1686 TextQualifier = Letters.QUOTE;
1687 TrimWhitespace = true;
1688 UseTextQualifier = true;
1689 Delimiter = Letters.COMMA;
1690 RecordDelimiter = Letters.NULL;
1691 Comment = Letters.POUND;
1692 UseComments = false;
1693 EscapeMode = CsvReader.ESCAPE_MODE_DOUBLED;
1694 SafetySwitch = true;
1695 SkipEmptyRecords = true;
1696 CaptureRawRecord = true;
1697 }
1698 }
1699
1700 private class HeadersHolder {
1701 public String[] Headers;
1702
1703 public int Length;
1704
1705 public HashMap IndexByName;
1706
1707 public HeadersHolder() {
1708 Headers = null;
1709 Length = 0;
1710 IndexByName = new HashMap();
1711 }
1712 }
1713
1714 private class StaticSettings {
1715
1716
1717
1718 public static final int MAX_BUFFER_SIZE = 1024;
1719
1720 public static final int MAX_FILE_BUFFER_SIZE = 4 * 1024;
1721
1722 public static final int INITIAL_COLUMN_COUNT = 10;
1723
1724 public static final int INITIAL_COLUMN_BUFFER_SIZE = 50;
1725 }
1726 }