zoukankan      html  css  js  c++  java
  • utf8 string

    https://github.com/BassLC/idUTF8lib

    Idiot's UTF-8 Library

    A very (too much really) simple Utf8 library for C++

    Usage

    #include "lib/idutf8lib.hpp"
    
    Utf8String text; //Empty UTF8 object
    Utf8String utf8_text("Héĺĺò Ẃórld"); //std::string compatible constructor
    text = "Jello!"; //Supports assignment with std::string AND Utf8String objects
    text.to_string(); // == std::string("Jello!")
    
    utf8_text.size_in_chars(); // == 11
    utf8_text.size_in_bytes(); // == 18 
    
    utf8_text[0]; // == std::string("H")
    utf8_text.sub_utf8str(1,3); // == Utf8String("éĺĺ")
    

    Features

    • Decodes and parses UTF-8 strings correctly (at least until now)
    • Very lightweight and small: less than 200 newlines total (*without counting tests)

    Requirements

    • A C++14 compatible compiler

    Notes

    Makefile serves only for testing purposes.

    Uses the Catch framework for tests.

    Thanks

    UTF8-CPP

    tiny-utf8

    #ifndef UTF8_CPP
    #define UTF8_CPP
    
    #include <string>
    #include <vector>
    
    
    class Utf8String {
    
    private:
        using Utf8Struct = std::vector<std::vector<uint8_t>>;
        
        Utf8Struct content;
    
        bool is_valid_utf8_string(const std::string &string) const;
    
    public:
        Utf8String() = default;
        Utf8String(const Utf8String &) = default;
        Utf8String(Utf8Struct &&content);
        Utf8String(const std::string &string);
        ~Utf8String() = default;
        
        std::string to_string() const;
        std::size_t size_in_chars() const;
        std::size_t size_in_bytes() const;
        void clear();
        Utf8String sub_utf8str(const std::size_t &initial_pos, const std::size_t &distance = std::string::npos) const;
    
        void operator=(const std::string &string);
        void operator=(const Utf8String &utf8_structure) noexcept;
        
        Utf8String operator+(const Utf8String &utf8_structure) const noexcept;
        void operator+=(const Utf8String &utf8_structure) noexcept;
        
        std::string operator[](const std::size_t &pos) const;
    
        friend std::ostream& operator<<(std::ostream &out, const Utf8String &utf8_structure) noexcept;
    
        bool operator==(const Utf8String &utf8_structure) const noexcept;
        bool operator==(const std::string &string) const noexcept;
    };
    
    #endif
    #include "idutf8lib.hpp"
    #include <iostream>
    #include <bitset>
    #include <exception>
    
    //* Private functions *
    
    bool Utf8String::is_valid_utf8_string(const std::string &string) const {
        for ( std::size_t pos = 0; pos < string.size(); ++pos ) {
            
            //IMPORTANT: The way you access a bitset object is completely backwards.
             //EXAMPLE: bitset = 0b10; bitset[0] == 0
            std::bitset<4> bits = string[pos] >> 4;
    
            //ASCII character
            if ( bits[3] == 0 ) {
                continue;
                
            //Continuation character - should NOT be here
            } else if ( bits[3] == 1 && bits[2] == 0 ){
                return false;
    
            } else {
                
                //Check number of characters
                while ( (bits <<= 1)[3] ) {
                    if ( ++pos >= string.size() ) {
                        return false;
                    }
    
                    if ( std::bitset<2>(string[pos] >> 6) != 0b10 ) {
                        return false;
                    }
                }
            }
        }
    
        return true;
    }
    
    
    //* Constructors *
    
    Utf8String::Utf8String(const std::string &string) {
        std::vector<uint8_t> utf8_char;
     
        if ( !is_valid_utf8_string(string) ) {
            throw(std::runtime_error("Invalid UTF8 String in constructor")); 
        }
        
        for ( const auto &chr : string ) {
            std::bitset<2> start_bits = (chr >> 6);
            
            if ( start_bits[1] == 0 ) {
    
                //ASCII character is pushed after making sure of the character before
                if ( !utf8_char.empty() ) {
                    content.push_back(utf8_char);
                    utf8_char.clear();
                }
                
                content.push_back(std::vector<uint8_t>(1, chr));
                continue;
                
            //If there's more than one byte    
            } else if ( start_bits == 0b11 ) {
    
                //Check to see if it has to flush the last character
                if ( !utf8_char.empty() ) {
                    content.push_back(utf8_char);
                    utf8_char.clear();
                }
            }
    
            utf8_char.push_back(chr);
        }
    
        //If last character is non-ASCII
        if ( !utf8_char.empty() ) {
            content.push_back(utf8_char);
        }
    }
    
    
    Utf8String::Utf8String(Utf8Struct &&temp) {
        content = temp;
    }
    
    
    //* Public Interface *
    
    std::string Utf8String::to_string() const {
        std::string temp;
    
        for ( const auto &chr : content ) {
            temp += std::string(chr.begin(), chr.end());
        }
    
        return temp;
    }
    
    
    std::size_t Utf8String::size_in_chars() const { return content.size(); }
    
    
    std::size_t Utf8String::size_in_bytes() const {
        std::size_t size = 0;
    
        for ( const auto &chr : content ) {
            size += chr.size();
        }
    
        return size;
    }
    
    
    void Utf8String::clear() { content.clear(); }
    
    
    Utf8String Utf8String::sub_utf8str(const std::size_t &initial_pos, const std::size_t &distance) const {
    
        const std::size_t end_pos = (distance == std::string::npos) ? content.size() : (initial_pos + distance);
    
        // To be sure we don't try to overflow
        if ( initial_pos >= content.size() || end_pos > content.size() ){
            throw std::out_of_range("Too big substr access");
        }
    
    
        return Utf8String(Utf8Struct(content.begin()+initial_pos, content.begin()+end_pos));
    }
    
    
    //* Operators *
    
    void Utf8String::operator=(const std::string &string) {
        Utf8String temp(string);
        content = temp.content;
    }
    
    
    void Utf8String::operator=(const Utf8String &utf8_object) noexcept { content = utf8_object.content; }
    
    
    std::string Utf8String::operator[](const std::size_t &pos) const {
        if ( pos >= content.size() ) {
            throw std::out_of_range("Bad UTF-8 range access with []");
        }
    
        return std::string(content[pos].begin(), content[pos].end());
    }
    
    
    Utf8String Utf8String::operator+(const Utf8String &utf8_structure) const noexcept {
        Utf8Struct temp = content;
        temp.insert(std::end(temp), std::begin(utf8_structure.content), std::end(utf8_structure.content));
        return Utf8String(std::move(temp));
    }
    
    
    void Utf8String::operator+=(const Utf8String &utf8_structure) noexcept {
        content.insert(std::end(content), std::begin(utf8_structure.content), std::end(utf8_structure.content));
    }
    
    
    std::ostream& operator<<(std::ostream &out, const Utf8String &utf8_structure) noexcept{
        out << utf8_structure.to_string();
        return out;
    }
    
    
    bool Utf8String::operator==(const Utf8String &utf8_structure) const noexcept {
        return (content == utf8_structure.content);
    }
    
    
    bool Utf8String::operator==(const std::string &string) const noexcept {
        return (this->to_string() == string);
    }
  • 相关阅读:
    我们在囧途之程序员转型记
    项目开发应遵循1+7还是7+1?
    自己用的一款模板解析程序(支持插件和扩展)(思路讨论和使用体验)
    设计的核心任务之一:层次的控制
    【设计 = 编码】 VS 【设计 ≠ 编码】
    从一生的角度看程序员的学习和发展
    编码质量与命名
    软件开发十年小史
    设计的核心任务之三:确保正交性
    国内外软件开发上的差距与分析
  • 原文地址:https://www.cnblogs.com/bodong/p/7419062.html
Copyright © 2011-2022 走看看