1 | /* |
2 | * Copyright 2006-2007 the original author or authors. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | |
17 | package org.springframework.batch.item.file.transform; |
18 | |
19 | import java.util.ArrayList; |
20 | import java.util.List; |
21 | |
22 | import org.springframework.util.Assert; |
23 | import org.springframework.util.StringUtils; |
24 | |
25 | /** |
26 | * |
27 | * @author Rob Harrop |
28 | * @author Dave Syer |
29 | * |
30 | */ |
31 | public class DelimitedLineTokenizer extends AbstractLineTokenizer { |
32 | /** |
33 | * Convenient constant for the common case of a tab delimiter. |
34 | */ |
35 | public static final char DELIMITER_TAB = '\t'; |
36 | |
37 | /** |
38 | * Convenient constant for the common case of a comma delimiter. |
39 | */ |
40 | public static final char DELIMITER_COMMA = ','; |
41 | |
42 | /** |
43 | * Convenient constant for the common case of a " character used to escape delimiters or line endings. |
44 | */ |
45 | public static final char DEFAULT_QUOTE_CHARACTER = '"'; |
46 | |
47 | // the delimiter character used when reading input. |
48 | private char delimiter; |
49 | |
50 | private char quoteCharacter = DEFAULT_QUOTE_CHARACTER; |
51 | |
52 | private String quoteString; |
53 | |
54 | /** |
55 | * Create a new instance of the {@link DelimitedLineTokenizer} class for the common case where the delimiter is a |
56 | * {@link #DELIMITER_COMMA comma}. |
57 | * |
58 | * @see #DelimitedLineTokenizer(char) |
59 | * @see #DELIMITER_COMMA |
60 | */ |
61 | public DelimitedLineTokenizer() { |
62 | this(DELIMITER_COMMA); |
63 | } |
64 | |
65 | /** |
66 | * Create a new instance of the {@link DelimitedLineTokenizer} class. |
67 | * |
68 | * @param delimiter the desired delimiter |
69 | */ |
70 | public DelimitedLineTokenizer(char delimiter) { |
71 | Assert.state(delimiter != DEFAULT_QUOTE_CHARACTER, "[" + DEFAULT_QUOTE_CHARACTER |
72 | + "] is not allowed as delimiter for tokenizers."); |
73 | |
74 | this.delimiter = delimiter; |
75 | setQuoteCharacter(DEFAULT_QUOTE_CHARACTER); |
76 | } |
77 | |
78 | /** |
79 | * Setter for the delimiter character. |
80 | * |
81 | * @param delimiter |
82 | */ |
83 | public void setDelimiter(char delimiter) { |
84 | this.delimiter = delimiter; |
85 | } |
86 | |
87 | /** |
88 | * Public setter for the quoteCharacter. The quote character can be used to extend a field across line endings or to |
89 | * enclose a String which contains the delimiter. Inside a quoted token the quote character can be used to escape |
90 | * itself, thus "a""b""c" is tokenized to a"b"c. |
91 | * |
92 | * @param quoteCharacter the quoteCharacter to set |
93 | * |
94 | * @see #DEFAULT_QUOTE_CHARACTER |
95 | */ |
96 | public void setQuoteCharacter(char quoteCharacter) { |
97 | this.quoteCharacter = quoteCharacter; |
98 | this.quoteString = "" + quoteCharacter; |
99 | } |
100 | |
101 | /** |
102 | * Yields the tokens resulting from the splitting of the supplied <code>line</code>. |
103 | * |
104 | * @param line the line to be tokenized |
105 | * |
106 | * @return the resulting tokens |
107 | */ |
108 | protected List doTokenize(String line) { |
109 | |
110 | List tokens = new ArrayList(); |
111 | |
112 | // line is never null in current implementation |
113 | // line is checked in parent: AbstractLineTokenizer.tokenize() |
114 | char[] chars = line.toCharArray(); |
115 | boolean inQuoted = false; |
116 | int lastCut = 0; |
117 | int length = chars.length; |
118 | |
119 | for (int i = 0; i < length; i++) { |
120 | |
121 | char currentChar = chars[i]; |
122 | boolean isEnd = (i == (length - 1)); |
123 | |
124 | if ((isDelimiterCharacter(currentChar) && !inQuoted) || isEnd) { |
125 | int endPosition = (isEnd ? (length - lastCut) : (i - lastCut)); |
126 | |
127 | if (isEnd && isDelimiterCharacter(currentChar)) { |
128 | endPosition--; |
129 | } |
130 | |
131 | String value = null; |
132 | |
133 | value = maybeStripQuotes(new String(chars, lastCut, endPosition)); |
134 | |
135 | tokens.add(value); |
136 | |
137 | if (isEnd && (isDelimiterCharacter(currentChar))) { |
138 | tokens.add(""); |
139 | } |
140 | |
141 | lastCut = i + 1; |
142 | } else if (isQuoteCharacter(currentChar)) { |
143 | inQuoted = !inQuoted; |
144 | } |
145 | |
146 | } |
147 | |
148 | return tokens; |
149 | } |
150 | |
151 | /** |
152 | * If the string is quoted strip (possibly with whitespace outside the quotes (which will be stripped), replace |
153 | * escaped quotes inside the string. Quotes are escaped with double instances of the quote character. |
154 | * |
155 | * @param string |
156 | * @return the same string but stripped and unescaped if necessary |
157 | */ |
158 | private String maybeStripQuotes(String string) { |
159 | String value = string.trim(); |
160 | if (isQuoted(value)) { |
161 | value = StringUtils.replace(value, "" + quoteCharacter + quoteCharacter, "" + quoteCharacter); |
162 | int endLength = value.length() - 1; |
163 | // used to deal with empty quoted values |
164 | if (endLength == 0) { |
165 | endLength = 1; |
166 | } |
167 | string = value.substring(1, endLength); |
168 | } |
169 | return string; |
170 | } |
171 | |
172 | /** |
173 | * Is this string surrounded by quite characters? |
174 | * |
175 | * @param value |
176 | * @return true if the value starts and ends with the {@link #quoteCharacter} |
177 | */ |
178 | private boolean isQuoted(String value) { |
179 | if (value.startsWith(quoteString) && value.endsWith(quoteString)) { |
180 | return true; |
181 | } |
182 | return false; |
183 | } |
184 | |
185 | /** |
186 | * Is the supplied character the delimiter character? |
187 | * |
188 | * @param c the character to be checked |
189 | * @return <code>true</code> if the supplied character is the delimiter character |
190 | * @see DelimitedLineTokenizer#DelimitedLineTokenizer(char) |
191 | */ |
192 | private boolean isDelimiterCharacter(char c) { |
193 | return c == this.delimiter; |
194 | } |
195 | |
196 | /** |
197 | * Is the supplied character a quote character? |
198 | * |
199 | * @param c the character to be checked |
200 | * @return <code>true</code> if the supplied character is an quote character |
201 | * @see #setQuoteCharacter(char) |
202 | */ |
203 | protected boolean isQuoteCharacter(char c) { |
204 | return c == quoteCharacter; |
205 | } |
206 | } |