View Javadoc

1   package org.apache.turbine.util.parser;
2   
3   /*
4    * Copyright 2001-2005 The Apache Software Foundation.
5    *
6    * Licensed under the Apache License, Version 2.0 (the "License")
7    * you may not use this file except in compliance with the License.
8    * You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  
19  import java.io.BufferedReader;
20  import java.io.IOException;
21  import java.io.InputStreamReader;
22  import java.io.Reader;
23  import java.io.StreamTokenizer;
24  
25  import java.util.ArrayList;
26  import java.util.Collections;
27  import java.util.Iterator;
28  import java.util.List;
29  import java.util.NoSuchElementException;
30  
31  import org.apache.commons.lang.exception.NestableRuntimeException;
32  
33  /***
34   * DataStreamParser is used to parse a stream with a fixed format and
35   * generate ValueParser objects which can be used to extract the values
36   * in the desired type.
37   *
38   * <p>The class itself is abstract - a concrete subclass which implements
39   * the initTokenizer method such as CSVParser or TSVParser is required
40   * to use the functionality.
41   *
42   * <p>The class implements the java.util.Iterator interface for convenience.
43   * This allows simple use in a Velocity template for example:
44   *
45   * <pre>
46   * #foreach ($row in $datastream)
47   *   Name: $row.Name
48   *   Description: $row.Description
49   * #end
50   * </pre>
51   *
52   * @author <a href="mailto:sean@informage.net">Sean Legassick</a>
53   * @author <a href="mailto:martin@mvdb.net">Martin van den Bemt</a>
54   * @author <a href="mailto:hps@intermeta.de">Henning P. Schmiedehausen</a>
55   * @version $Id: DataStreamParser.java 280284 2005-09-12 07:57:42Z henning $
56   */
57  public abstract class DataStreamParser implements Iterator
58  {
59      /***
60       * The constant for empty fields
61       */
62      protected static final String EMPTYFIELDNAME = "UNKNOWNFIELD";
63  
64      /***
65       * The list of column names.
66       */
67      private List columnNames = Collections.EMPTY_LIST;
68  
69      /***
70       * The stream tokenizer for reading values from the input reader.
71       */
72      private StreamTokenizer tokenizer;
73  
74      /***
75       * The parameter parser holding the values of columns for the current line.
76       */
77      private ValueParser lineValues;
78  
79      /***
80       * Indicates whether or not the tokenizer has read anything yet.
81       */
82      private boolean neverRead = true;
83  
84      /***
85       * The character encoding of the input
86       */
87      private String characterEncoding;
88  
89      /***
90       * The fieldseperator, which can be almost any char
91       */
92      private char fieldSeparator;
93  
94      /***
95       * Create a new DataStreamParser instance. Requires a Reader to read the
96       * comma-separated values from, a list of column names and a
97       * character encoding.
98       *
99       * @param in the input reader.
100      * @param columnNames a list of column names.
101      * @param characterEncoding the character encoding of the input.
102      */
103     public DataStreamParser(Reader in, List columnNames,
104                             String characterEncoding)
105     {
106         setColumnNames(columnNames);
107 
108         this.characterEncoding = characterEncoding;
109 
110         if (this.characterEncoding == null)
111         {
112             if (in instanceof InputStreamReader)
113             {
114                 this.characterEncoding = ((InputStreamReader) in).getEncoding();
115             }
116 
117             if (this.characterEncoding == null)
118             {
119                 // try and get the characterEncoding from the reader
120                 this.characterEncoding = "US-ASCII";
121             }
122         }
123 
124         tokenizer = new StreamTokenizer(new BufferedReader(in));
125         initTokenizer(tokenizer);
126     }
127 
128     /***
129      * Initialize the StreamTokenizer instance used to read the lines
130      * from the input reader. This must be implemented in subclasses to
131      * set up other tokenizing properties.
132      *
133      * @param tokenizer the tokenizer to adjust
134      */
135     protected void initTokenizer(StreamTokenizer tokenizer)
136     {
137         tokenizer.resetSyntax();
138 
139         // leave out the comma sign (,), we need it for empty fields
140         tokenizer.wordChars(' ', Character.MAX_VALUE);
141 
142         // and  set the quote mark as the quoting character
143         tokenizer.quoteChar('"');
144 
145         // and finally say that end of line is significant
146         tokenizer.eolIsSignificant(true);
147     }
148 
149     /***
150      * This method must be called to setup the field seperator
151      * @param fieldSeparator the char which separates the fields
152      */
153     public void setFieldSeparator(char fieldSeparator)
154     {
155         this.fieldSeparator = fieldSeparator;
156         // make this field also an ordinary char by default.
157         tokenizer.ordinaryChar(fieldSeparator);
158     }
159 
160     /***
161      * Set the list of column names explicitly.
162      *
163      * @param columnNames A list of column names.
164      */
165     public void setColumnNames(List columnNames)
166     {
167         if (columnNames != null)
168         {
169             this.columnNames = columnNames;
170         }
171     }
172 
173     /***
174      * get the list of column names.
175      *
176      */
177     public List getColumnNames()
178     {
179         return columnNames;
180     }
181 
182     /***
183      * Read the list of column names from the input reader using the
184      * tokenizer. If fieldNames are empty, we use the current fieldNumber
185      * + the EMPTYFIELDNAME to make one up.
186      *
187      * @exception IOException an IOException occurred.
188      */
189     public void readColumnNames()
190             throws IOException
191     {
192         List columnNames = new ArrayList();
193         int fieldCounter = 0;
194 
195         if (hasNextRow())
196         {
197             String colName = null;
198             boolean foundEol = false;
199 
200             while(!foundEol)
201             {
202                 tokenizer.nextToken();
203 
204                 if (tokenizer.ttype == '"'
205                         || tokenizer.ttype == StreamTokenizer.TT_WORD)
206                 {
207                     // tokenizer.ttype is either '"' or TT_WORD
208                     colName = tokenizer.sval;
209                 }
210                 else
211                 {
212                     // fieldSeparator, EOL or EOF
213                     fieldCounter++;
214 
215                     if (colName == null)
216                     {
217                         colName = EMPTYFIELDNAME + fieldCounter;
218                     }
219 
220                     columnNames.add(colName);
221                     colName = null;
222                 }
223 
224                 // EOL and EOF are checked independently from existing fields.
225                 if (tokenizer.ttype == StreamTokenizer.TT_EOL)
226                 {
227                     foundEol = true;
228                 }
229                 else if (tokenizer.ttype == StreamTokenizer.TT_EOF)
230                 {
231                     // Keep this token in the tokenizer for hasNext()
232                     tokenizer.pushBack();
233                     foundEol = true;
234                 }
235             }
236 
237             setColumnNames(columnNames);
238         }
239     }
240 
241     /***
242      * Determine whether a further row of values exists in the input.
243      *
244      * @return true if the input has more rows.
245      * @exception IOException an IOException occurred.
246      */
247     public boolean hasNextRow()
248             throws IOException
249     {
250         // check for end of line ensures that an empty last line doesn't
251         // give a false positive for hasNextRow
252         if (neverRead || tokenizer.ttype == StreamTokenizer.TT_EOL)
253         {
254             tokenizer.nextToken();
255             tokenizer.pushBack();
256             neverRead = false;
257         }
258         return tokenizer.ttype != StreamTokenizer.TT_EOF;
259     }
260 
261     /***
262      * Returns a ValueParser object containing the next row of values.
263      *
264      * @return a ValueParser object.
265      * @exception IOException an IOException occurred.
266      * @exception NoSuchElementException there are no more rows in the input.
267      */
268     public ValueParser nextRow()
269             throws IOException, NoSuchElementException
270     {
271         if (!hasNextRow())
272         {
273             throw new NoSuchElementException();
274         }
275 
276         if (lineValues == null)
277         {
278             lineValues = new BaseValueParser(characterEncoding);
279         }
280         else
281         {
282             lineValues.clear();
283         }
284 
285         Iterator it = columnNames.iterator();
286 
287         String currVal = "";
288         String colName = null;
289 
290         boolean foundEol = false;
291         while (!foundEol || it.hasNext())
292         {
293             if (!foundEol)
294             {
295                 tokenizer.nextToken();
296             }
297 
298             if (colName == null && it.hasNext())
299             {
300                 colName = String.valueOf(it.next());
301             }
302 
303             if (tokenizer.ttype == '"'
304                     || tokenizer.ttype == StreamTokenizer.TT_WORD)
305             {
306                 // tokenizer.ttype is either '"' or TT_WORD
307                 currVal = tokenizer.sval;
308             }
309             else
310             {
311                 // fieldSeparator, EOL or EOF
312                 lineValues.add(colName, currVal);
313                 colName = null;
314                 currVal = "";
315             }
316 
317             // EOL and EOF are checked independently from existing fields.
318             if (tokenizer.ttype == StreamTokenizer.TT_EOL)
319             {
320                 foundEol = true;
321             }
322             else if (tokenizer.ttype == StreamTokenizer.TT_EOF)
323             {
324                 // Keep this token in the tokenizer for hasNext()
325                 tokenizer.pushBack();
326                 foundEol = true;
327             }
328         }
329 
330         return lineValues;
331     }
332 
333     /***
334      * Determine whether a further row of values exists in the input.
335      *
336      * @return true if the input has more rows.
337      */
338     public boolean hasNext()
339     {
340         boolean hasNext = false;
341 
342         try
343         {
344             hasNext = hasNextRow();
345         }
346         catch (IOException e)
347         {
348             throw new NestableRuntimeException(e);
349         }
350 
351         return hasNext;
352     }
353 
354     /***
355      * Returns a ValueParser object containing the next row of values.
356      *
357      * @return a ValueParser object as an Object.
358      * @exception NoSuchElementException there are no more rows in the input
359      *                                   or an IOException occurred.
360      */
361     public Object next()
362             throws NoSuchElementException
363     {
364         Object nextRow = null;
365 
366         try
367         {
368             nextRow = nextRow();
369         }
370         catch (IOException e)
371         {
372             throw new NestableRuntimeException(e);
373         }
374 
375         return nextRow;
376     }
377 
378     /***
379      * The optional Iterator.remove method is not supported.
380      *
381      * @exception UnsupportedOperationException the operation is not supported.
382      */
383     public void remove()
384             throws UnsupportedOperationException
385     {
386         throw new UnsupportedOperationException();
387     }
388 }