Clover coverage report - Clover results for XOM 1.2d1
Coverage timestamp: Wed Feb 8 2006 08:31:33 EST
file stats: LOC: 226   Methods: 3
NCLOC: 113   Classes: 1
 
 Source file Conditionals Statements Methods TOTAL
EncodingHeuristics.java 77.8% 90.5% 66.7% 85.8%
coverage coverage
 1    /* Copyright 2002, 2003, 2005 Elliotte Rusty Harold
 2   
 3    This library is free software; you can redistribute it and/or modify
 4    it under the terms of version 2.1 of the GNU Lesser General Public
 5    License as published by the Free Software Foundation.
 6   
 7    This library is distributed in the hope that it will be useful,
 8    but WITHOUT ANY WARRANTY; without even the implied warranty of
 9    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 10    GNU Lesser General Public License for more details.
 11   
 12    You should have received a copy of the GNU Lesser General Public
 13    License along with this library; if not, write to the
 14    Free Software Foundation, Inc., 59 Temple Place, Suite 330,
 15    Boston, MA 02111-1307 USA
 16   
 17    You can contact Elliotte Rusty Harold by sending e-mail to
 18    elharo@metalab.unc.edu. Please include the word "XOM" in the
 19    subject line. The XOM home page is located at http://www.xom.nu/
 20    */
 21   
 22    package nu.xom.xinclude;
 23   
 24    import java.io.IOException;
 25    import java.io.InputStream;
 26   
 27    /**
 28    * <p>
 29    * <code>EncodingHeuristics</code> reads from a stream
 30    * (which should be buffered) and attempts to guess
 31    * what the encoding of the text in the stream is.
 32    * Byte order marks are stripped from the stream.
 33    * If it fails to determine the type of the encoding,
 34    * it returns the default UTF-8.
 35    * </p>
 36    *
 37    *
 38    * @author Elliotte Rusty Harold
 39    * @version 1.0
 40    */
 41    class EncodingHeuristics {
 42   
 43    // No instances allowed
 44  0 private EncodingHeuristics() {}
 45   
 46   
 47    /**
 48    * <p>
 49    * This utility method uses a variety of heuristics to
 50    * attempt to guess the encoding from the initial
 51    * characters.
 52    * </p>
 53    *
 54    * @param in <code>InputStream</code> to read from.
 55    * @return String The name of the encoding.
 56    * @throws IOException if the stream cannot be reset back
 57    * to where it was when the method was invoked.
 58    */
 59  24 public static String readEncodingFromStream(InputStream in)
 60    throws IOException {
 61   
 62    // This may fail if there are a lot of space
 63    // characters before the end of the encoding declaration
 64  24 in.mark(1024);
 65   
 66  24 try {
 67    // Lots of things can go wrong here. If any do,
 68    // return "UTF-8" as the default.
 69  24 int byte1 = in.read();
 70  24 int byte2 = in.read();
 71  24 if (byte1 == 0xFE && byte2 == 0xFF) {
 72    // Don't reset because the byte order mark should not be
 73    // included per section 4.3 of the XInclude spec
 74  1 return "UnicodeBig";
 75    }
 76  23 else if (byte1 == 0xFF && byte2 == 0xFE) {
 77    // Don't reset because the byte order mark should not be
 78    // included per section 4.3 of the XInclude spec
 79  1 return "UnicodeLittle";
 80    }
 81   
 82    /* In accordance with the Character Model,
 83    when the text format is a Unicode encoding, the XInclude
 84    processor must fail the inclusion when the text in the
 85    selected range is non-normalized. When transcoding
 86    characters to a Unicode encoding from a legacy encoding,
 87    a normalizing transcoder must be used. */
 88   
 89  22 int byte3 = in.read();
 90    // check for UTF-8 byte order mark
 91  22 if (byte1 == 0xEF && byte2 == 0xBB && byte3 == 0xBF) {
 92    // Don't reset because the byte order mark should not be
 93    // included per section 4.3 of the XInclude spec
 94  1 return "UTF-8";
 95    }
 96   
 97  21 int byte4 = in.read();
 98  21 if (byte1 == 0x00
 99    && byte2 == 0x00 && byte3 == 0xFE && byte4 == 0xFF) {
 100    // Don't reset because the byte order mark should not be
 101    // included per section 4.3 of the XInclude spec
 102    // Most Java VMs don't support this next one
 103  0 return "UTF32BE";
 104    }
 105  21 else if (byte1 == 0x00 && byte2 == 0x00
 106    && byte3 == 0xFF && byte4 == 0xFE) {
 107    // Don't reset because the byte order mark should not be
 108    // included per section 4.3 of the XInclude spec
 109    // Most Java VMs don't support this next one
 110  0 return "UTF32LE";
 111    }
 112   
 113    // no byte order mark present; first character must be
 114    // less than sign or white space
 115    // Let's look for less-than signs first
 116  21 if (byte1 == 0x00 && byte2 == 0x00
 117    && byte3 == 0x00 && byte4 == '<') {
 118  0 in.reset();
 119  0 return "UTF32BE";
 120    }
 121  21 else if (byte1 == '<' && byte2 == 0x00
 122    && byte3 == 0x00 && byte4 == 0x00) {
 123  0 in.reset();
 124  0 return "UTF32LE";
 125    }
 126  21 else if (byte1 == 0x00 && byte2 == '<'
 127    && byte3 == 0x00 && byte4 == '?') {
 128  1 in.reset();
 129  1 return "UnicodeBigUnmarked";
 130    }
 131  20 else if (byte1 == '<' && byte2 == 0x00
 132    && byte3 == '?' && byte4 == 0x00) {
 133  1 in.reset();
 134  1 return "UnicodeLittleUnmarked";
 135    }
 136  19 else if (byte1 == '<' && byte2 == '?'
 137    && byte3 == 'x' && byte4 == 'm') {
 138    // ASCII compatible, must read encoding declaration.
 139    // 1024 bytes will be far enough to read most
 140    // XML declarations
 141  2 byte[] data = new byte[1024];
 142  2 data[0] = (byte) byte1;
 143  2 data[1] = (byte) byte2;
 144  2 data[2] = (byte) byte3;
 145  2 data[3] = (byte) byte4;
 146  2 int length = in.read(data, 4, 1020) + 4;
 147    // Use Latin-1 (ISO-8859-1) because it's ASCII compatible
 148    // and all byte sequences are legal Latin-1 sequences
 149    // so I don't have to worry about encoding errors if I
 150    // slip past the end of the XML/text declaration
 151  2 String declaration=new String(data, 0, length, "8859_1");
 152    // If any of these throw a
 153    // StringIndexOutOfBoundsException,
 154    // we just fall into the catch block and return null
 155    // since this can't be well-formed XML
 156  2 String encoding = findEncodingDeclaration(declaration);
 157  1 in.reset();
 158  1 return encoding;
 159   
 160    }
 161  17 else if (byte1 == 0x4C && byte2 == 0x6F
 162    && byte3 == 0xA7 && byte4 == 0x94) {
 163    // EBCDIC compatible, must read encoding declaration
 164  1 byte[] buffer = new byte[1016];
 165  47 for (int i = 0; i < buffer.length; i++) {
 166  47 int c = in.read();
 167  1 if (c == -1) break;
 168  46 buffer[i] = (byte) c;
 169    }
 170  1 in.reset();
 171    // Most EBCDIC encodings are compatible with Cp037 over
 172    // the range we care about
 173  1 return findEncodingDeclaration(new String(buffer, "Cp037"));
 174    }
 175   
 176    }
 177    catch (Exception ex) {
 178  1 in.reset();
 179  1 return "UTF-8";
 180    }
 181   
 182    // no XML or text declaration present
 183  16 in.reset();
 184  16 return "UTF-8";
 185   
 186    }
 187   
 188   
 189  3 private static String findEncodingDeclaration(String declaration)
 190    throws IOException {
 191   
 192  3 int position = declaration.indexOf("encoding") + 8;
 193  3 char c;
 194    // get rid of white space before equals sign
 195  3 while (true) {
 196  3 c = declaration.charAt(position++);
 197  3 if (c !=' ' && c != '\t' && c != '\r' && c != '\n') {
 198  3 break;
 199    }
 200    }
 201  3 if (c != '=') { // malformed
 202  1 throw new IOException("Couldn't determine encoding");
 203    }
 204    // get rid of white space after equals sign
 205  2 while (true) {
 206  2 c = declaration.charAt(position++);
 207  2 if (c !=' ' && c != '\t' && c != '\r' && c != '\n') {
 208  2 break;
 209    }
 210    }
 211  2 char delimiter = c;
 212  2 if (delimiter != '\'' && delimiter != '"') { // malformed
 213  0 return "UTF-8";
 214    }
 215    // now positioned to read encoding name
 216  2 StringBuffer encodingName = new StringBuffer();
 217  2 while (true) {
 218  12 c = declaration.charAt(position++);
 219  2 if (c == delimiter) break;
 220  10 encodingName.append(c);
 221    }
 222  2 return encodingName.toString();
 223   
 224    }
 225   
 226    }