001 /*--------------------------------------------------------------------------+
002 $Id: EByteOrderMark.java 29722 2010-08-16 13:40:26Z deissenb $
003 | |
004 | Copyright 2005-2010 Technische Universitaet Muenchen |
005 | |
006 | Licensed under the Apache License, Version 2.0 (the "License"); |
007 | you may not use this file except in compliance with the License. |
008 | You may obtain a copy of the License at |
009 | |
010 | http://www.apache.org/licenses/LICENSE-2.0 |
011 | |
012 | Unless required by applicable law or agreed to in writing, software |
013 | distributed under the License is distributed on an "AS IS" BASIS, |
014 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
015 | See the License for the specific language governing permissions and |
016 | limitations under the License. |
017 +--------------------------------------------------------------------------*/
018 package edu.tum.cs.commons.filesystem;
019
020 import java.util.Arrays;
021
022 import edu.tum.cs.commons.assertion.CCSMAssert;
023 import edu.tum.cs.commons.collections.ArrayUtils;
024
025 /**
026 * Enumeration of the UTF byte order marks (BOM). The actual values are taken
027 * from http://unicode.org/faq/utf_bom.html
028 * <p>
029 * The order of the values in this enum is chosen such that BOMs that are a
030 * prefix of other BOMs are at the end, i.e. UTF-32 is before UTF-16. This way
031 * we can check the BOM prefix in the order of the enum values' appearance.
032 *
033 * @author hummelb
034 * @author $Author: deissenb $
035 * @version $Rev: 29722 $
036 * @levd.rating GREEN Hash: 2AAB6CBCE60BACE98E4803B711962593
037 */
038 public enum EByteOrderMark {
039
040 /** UTF-32 with big endian encoding. */
041 UTF_32BE("UTF-32BE", new byte[] { 0x00, 0x00, (byte) 0xFE, (byte) 0xFF }),
042
043 /** UTF-32 with little endian encoding. */
044 UTF_32LE("UTF-32LE", new byte[] { (byte) 0xFF, (byte) 0xFE, 0x00, 0x00 }),
045
046 /** UTF-16 with big endian encoding. */
047 UTF_16BE("UTF-16BE", new byte[] { (byte) 0xFE, (byte) 0xFF }),
048
049 /** UTF-16 with little endian encoding. */
050 UTF_16LE("UTF-16LE", new byte[] { (byte) 0xFF, (byte) 0xFE }),
051
052 /**
053 * UTF-8. Note that for UTF-8 the endianess is not relevant and that the BOM
054 * is optional.
055 */
056 UTF_8_BOM("UTF-8", new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF });
057
058 /** The maximal length of a BOM. */
059 public static final int MAX_BOM_LENGTH = 4;
060
061 /** The name of the encoding */
062 private final String encoding;
063
064 /** The byte order mark. */
065 private final byte[] bom;
066
067 /** Constructor. */
068 private EByteOrderMark(String encoding, byte[] bom) {
069 this.encoding = encoding;
070 CCSMAssert.isTrue(bom.length <= MAX_BOM_LENGTH,
071 "Inconsistent max BOM length!");
072 this.bom = bom;
073 }
074
075 /** Returns the encoding. */
076 public String getEncoding() {
077 return encoding;
078 }
079
080 /**
081 * Returns the byte order mark. This returns a copy, so the array may be
082 * modified.
083 */
084 public byte[] getBOM() {
085 return Arrays.copyOf(bom, bom.length);
086 }
087
088 /** Returns the size of the BOM in bytes. */
089 public int getBOMLength() {
090 return bom.length;
091 }
092
093 /**
094 * This method checks the start of the provided data array to find a BOM. If
095 * a BOM is found, the corresponding enum value is returned. Otherwise,
096 * <code>null</code> is returned. If possible, the provided data should at
097 * least be of size {@value #MAX_BOM_LENGTH}. Otherwise the encoding might
098 * not be detected correctly. However, the method also works with shorter
099 * arrays (e.g. if a file consists of only 3 bytes).
100 */
101 public static EByteOrderMark determineBOM(byte[] data) {
102 for (EByteOrderMark bom : values()) {
103 if (ArrayUtils.isPrefix(bom.bom, data)) {
104 return bom;
105 }
106 }
107 return null;
108 }
109 }