View Javadoc

1   /*
2    * Copyright 2006-2013 the original author or authors.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *      http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package org.springframework.batch.item.file.transform;
18  
19  import java.util.ArrayList;
20  import java.util.Collection;
21  import java.util.HashSet;
22  import java.util.List;
23  
24  import org.springframework.util.Assert;
25  import org.springframework.util.StringUtils;
26  
27  /**
28   * A {@link LineTokenizer} implementation that splits the input String on a
29   * configurable delimiter. This implementation also supports the use of an
30   * escape character to escape delimiters and line endings.
31   *
32   * @author Rob Harrop
33   * @author Dave Syer
34   * @author Michael Minella
35   */
36  public class DelimitedLineTokenizer extends AbstractLineTokenizer {
37  	/**
38  	 * Convenient constant for the common case of a tab delimiter.
39  	 */
40  	public static final String DELIMITER_TAB = "\t";
41  
42  	/**
43  	 * Convenient constant for the common case of a comma delimiter.
44  	 */
45  	public static final String DELIMITER_COMMA = ",";
46  
47  	/**
48  	 * Convenient constant for the common case of a " character used to escape
49  	 * delimiters or line endings.
50  	 */
51  	public static final char DEFAULT_QUOTE_CHARACTER = '"';
52  
53  	// the delimiter character used when reading input.
54  	private String delimiter;
55  
56  	private char quoteCharacter = DEFAULT_QUOTE_CHARACTER;
57  
58  	private String quoteString;
59  
60  	private Collection<Integer> includedFields = null;
61  
62  	/**
63  	 * Create a new instance of the {@link DelimitedLineTokenizer} class for the
64  	 * common case where the delimiter is a {@link #DELIMITER_COMMA comma}.
65  	 *
66  	 * @see #DelimitedLineTokenizer(String)
67  	 * @see #DELIMITER_COMMA
68  	 */
69  	public DelimitedLineTokenizer() {
70  		this(DELIMITER_COMMA);
71  	}
72  
73  	/**
74  	 * Create a new instance of the {@link DelimitedLineTokenizer} class.
75  	 *
76  	 * @param delimiter the desired delimiter
77  	 */
78  	public DelimitedLineTokenizer(String delimiter) {
79  		Assert.state(!delimiter.equals(String.valueOf(DEFAULT_QUOTE_CHARACTER)), "[" + DEFAULT_QUOTE_CHARACTER
80  				+ "] is not allowed as delimiter for tokenizers.");
81  
82  		this.delimiter = delimiter;
83  		setQuoteCharacter(DEFAULT_QUOTE_CHARACTER);
84  	}
85  
86  	/**
87  	 * Setter for the delimiter character.
88  	 *
89  	 * @param delimiter
90  	 */
91  	public void setDelimiter(String delimiter) {
92  		this.delimiter = delimiter;
93  	}
94  
95  	/**
96  	 * The fields to include in the output by position (starting at 0). By
97  	 * default all fields are included, but this property can be set to pick out
98  	 * only a few fields from a larger set. Note that if field names are
99  	 * provided, their number must match the number of included fields.
100 	 *
101 	 * @param includedFields the included fields to set
102 	 */
103 	public void setIncludedFields(int[] includedFields) {
104 		this.includedFields = new HashSet<Integer>();
105 		for (int i : includedFields) {
106 			this.includedFields.add(i);
107 		}
108 	}
109 
110 	/**
111 	 * Public setter for the quoteCharacter. The quote character can be used to
112 	 * extend a field across line endings or to enclose a String which contains
113 	 * the delimiter. Inside a quoted token the quote character can be used to
114 	 * escape itself, thus "a""b""c" is tokenized to a"b"c.
115 	 *
116 	 * @param quoteCharacter the quoteCharacter to set
117 	 *
118 	 * @see #DEFAULT_QUOTE_CHARACTER
119 	 */
120 	public final void setQuoteCharacter(char quoteCharacter) {
121 		this.quoteCharacter = quoteCharacter;
122 		this.quoteString = "" + quoteCharacter;
123 	}
124 
125 	/**
126 	 * Yields the tokens resulting from the splitting of the supplied
127 	 * <code>line</code>.
128 	 *
129 	 * @param line the line to be tokenized
130 	 *
131 	 * @return the resulting tokens
132 	 */
133 	@Override
134 	protected List<String> doTokenize(String line) {
135 
136 		List<String> tokens = new ArrayList<String>();
137 
138 		// line is never null in current implementation
139 		// line is checked in parent: AbstractLineTokenizer.tokenize()
140 		char[] chars = line.toCharArray();
141 		boolean inQuoted = false;
142 		int lastCut = 0;
143 		int length = chars.length;
144 		int fieldCount = 0;
145 
146 		for (int i = 0; i < length; i++) {
147 
148 			char currentChar = chars[i];
149 			boolean isEnd = (i == (length - 1));
150 
151 			boolean isDelimiter = isDelimiter(chars, i, delimiter);
152 
153 			if ((isDelimiter && !inQuoted) || isEnd) {
154 				int endPosition = (isEnd ? (length - lastCut) : (i - lastCut));
155 
156 				if (isEnd && isDelimiter) {
157 					endPosition--;
158 				}
159 				else if (!isEnd){
160 					endPosition = (endPosition - delimiter.length()) + 1;
161 				}
162 
163 				if (includedFields == null || includedFields.contains(fieldCount)) {
164 					String value = maybeStripQuotes(new String(chars, lastCut, endPosition));
165 					tokens.add(value);
166 				}
167 
168 				fieldCount++;
169 
170 				if (isEnd && (isDelimiter)) {
171 					if (includedFields == null || includedFields.contains(fieldCount)) {
172 						tokens.add("");
173 					}
174 					fieldCount++;
175 				}
176 
177 				lastCut = i + 1;
178 			}
179 			else if (isQuoteCharacter(currentChar)) {
180 				inQuoted = !inQuoted;
181 			}
182 
183 		}
184 
185 		return tokens;
186 	}
187 
188 	/**
189 	 * If the string is quoted strip (possibly with whitespace outside the
190 	 * quotes (which will be stripped), replace escaped quotes inside the
191 	 * string. Quotes are escaped with double instances of the quote character.
192 	 *
193 	 * @param string
194 	 * @return the same string but stripped and unescaped if necessary
195 	 */
196 	private String maybeStripQuotes(String string) {
197 		String value = string.trim();
198 		if (isQuoted(value)) {
199 			value = StringUtils.replace(value, "" + quoteCharacter + quoteCharacter, "" + quoteCharacter);
200 			int endLength = value.length() - 1;
201 			// used to deal with empty quoted values
202 			if (endLength == 0) {
203 				endLength = 1;
204 			}
205 			value = value.substring(1, endLength);
206 			return value;
207 		}
208 		return string;
209 	}
210 
211 	/**
212 	 * Is this string surrounded by quote characters?
213 	 *
214 	 * @param value
215 	 * @return true if the value starts and ends with the
216 	 * {@link #quoteCharacter}
217 	 */
218 	private boolean isQuoted(String value) {
219 		if (value.startsWith(quoteString) && value.endsWith(quoteString)) {
220 			return true;
221 		}
222 		return false;
223 	}
224 
225 	/**
226 	 * Is the supplied character the delimiter character?
227 	 *
228 	 * @param c the character to be checked
229 	 * @return <code>true</code> if the supplied character is the delimiter
230 	 * character
231 	 * @see DelimitedLineTokenizer#DelimitedLineTokenizer(char)
232 	 */
233 	private boolean isDelimiter(char[] chars, int i, String token) {
234 		boolean result = false;
235 
236 		if(i >= token.length()) {
237 			String end = new String(chars, (i-token.length()) + 1, token.length());
238 			if(token.equals(end)) {
239 				result = true;
240 			}
241 		}
242 
243 		return result;
244 	}
245 
246 	/**
247 	 * Is the supplied character a quote character?
248 	 *
249 	 * @param c the character to be checked
250 	 * @return <code>true</code> if the supplied character is an quote character
251 	 * @see #setQuoteCharacter(char)
252 	 */
253 	protected boolean isQuoteCharacter(char c) {
254 		return c == quoteCharacter;
255 	}
256 }