charset_utils.hpp 1.85 KB
/* 
 * File:   charset_utils.hpp
 * Author: lennyn
 *
 * Created on November 15, 2013, 1:57 PM
 */

#ifndef CHARSET_UTILS_HPP
#define	CHARSET_UTILS_HPP

#include <algorithm>
#include <cstdint>
#include <string>
#include <vector>

#include "CharsetConverter.hpp"

/**
 * Builds the lookup table used to classify code points as whitespace.
 *
 * The returned vector is indexed directly by code point. An entry is
 * non-zero when the code point is treated as whitespace and zero otherwise.
 * The table covers code points 0x0000 through 0x3000 inclusive, so it holds
 * 0x3001 entries.
 *
 * NOTE(review): U+202F NARROW NO-BREAK SPACE is not in this table although
 * the other no-break spaces (U+00A0, U+2007) are; confirm whether that
 * omission is intentional.
 *
 * @return table of 0/1 flags, one per code point in [0x0000, 0x3000].
 */
static inline std::vector<char> initializeWhitespaces() {
    // Highest code point flagged below. The table needs one more element
    // than this value so that res[lastCodepoint] is a valid index.
    const std::vector<char>::size_type lastCodepoint = 0x3000;
    std::vector<char> res(lastCodepoint + 1, false);
    res[0x0000] = true; // NULL
    res[0x0009] = true; // CHARACTER TABULATION
    res[0x000A] = true; // LINE FEED (LF)
    res[0x000B] = true; // LINE TABULATION
    res[0x000C] = true; // FORM FEED (FF)
    res[0x000D] = true; // CARRIAGE RETURN (CR)
    res[0x001C] = true; // INFORMATION SEPARATOR FOUR
    res[0x001D] = true; // INFORMATION SEPARATOR THREE
    res[0x001E] = true; // INFORMATION SEPARATOR TWO
    res[0x001F] = true; // INFORMATION SEPARATOR ONE
    res[0x0020] = true; // SPACE
    res[0x0085] = true; // NEXT LINE (NEL)
    res[0x00A0] = true; // NON-BREAKING SPACE
    res[0x1680] = true; // OGHAM SPACE MARK
    res[0x180E] = true; // MONGOLIAN VOWEL SEPARATOR
    res[0x2000] = true; // EN QUAD
    res[0x2001] = true; // EM QUAD
    res[0x2002] = true; // EN SPACE
    res[0x2003] = true; // EM SPACE
    res[0x2004] = true; // THREE-PER-EM SPACE
    res[0x2005] = true; // FOUR-PER-EM SPACE
    res[0x2006] = true; // SIX-PER-EM SPACE
    res[0x2007] = true; // FIGURE SPACE
    res[0x2008] = true; // PUNCTUATION SPACE
    res[0x2009] = true; // THIN SPACE
    res[0x200A] = true; // HAIR SPACE
    res[0x2028] = true; // LINE SEPARATOR
    res[0x2029] = true; // PARAGRAPH SEPARATOR
    res[0x205F] = true; // MEDIUM MATHEMATICAL SPACE
    res[lastCodepoint] = true; // IDEOGRAPHIC SPACE (U+3000)
    return res;
}

/**
 * Reports whether a code point is treated as whitespace.
 *
 * The lookup table comes from initializeWhitespaces() and is built once,
 * the first time this function is called. Any code point at or beyond the
 * end of the table is reported as non-whitespace.
 *
 * @param codepoint code point to classify.
 * @return true if the table flags the code point as whitespace.
 */
inline bool isWhitespace(uint32_t codepoint) {
    static const std::vector<char> table = initializeWhitespaces();
    // Guard first: indexing past the table would be out of bounds.
    if (codepoint >= table.size()) {
        return false;
    }
    return table[codepoint] != 0;
}

#endif	/* CHARSET_UTILS_HPP */