zoukankan      html  css  js  c++  java
  • emoji处理方法

    在做微信公众号开发时碰到了获取微信基本信息的需求,但是在向数据库保存用户昵称的时候出错了,

    出错原因是微信用户的昵称中包含emoji等特殊符号,表情图片,

    mysql数据库使用的是utf8,最大存储3个字节,而emoji等以4个字节进行的保存,所以保存不了

    处理方法:

    1:修改数据库编码由utf8升级为utf8mb4,utf8mb4是utf8的超集,包含全部unicode编码;该方法没有具体操作;

    2:进行过滤,对获取到的用户昵称进行编码过滤,把emoji等字符替换为空字符串"";但是该方法在碰到iOS上的一些emoji时就失败了。在下方增加了一些处理过滤方法;

    该过滤方法来自于网上
     /**
         * Reports whether the text contains at least one emoji or other character
         * that {@link #isEmojiCharacter(char)} does not accept.
         *
         * <p>Despite its name, {@code isEmojiCharacter} returns {@code true} for
         * ordinary characters, so a character counts as an emoji here when that
         * check returns {@code false}. The original code tested the non-negated
         * result and therefore reported every non-blank plain string as containing
         * emoji.
         *
         * @param source text to inspect; may be {@code null}
         * @return {@code true} as soon as one rejected character is found;
         *         {@code false} for blank input or text made only of accepted characters
         */
        public static boolean containsEmoji(String source) {
            if (StringUtils.isBlank(source)) {
                return false;
            }

            for (int i = 0, len = source.length(); i < len; i++) {
                // A rejected UTF-16 unit (surrogate half, disallowed control char)
                // means the text holds an emoji or other non-text character.
                if (!isEmojiCharacter(source.charAt(i))) {
                    return true;
                }
            }

            return false;
        }
    
    
        /**
         * Tests whether a UTF-16 code unit is an ordinary, storable character.
         *
         * <p>NOTE: the name is misleading. The method returns {@code true} for
         * characters that are NOT emoji: NUL, TAB, LF, CR, U+0020..U+D7FF and
         * U+E000..U+FFFD. It returns {@code false} for surrogate halves
         * (U+D800..U+DFFF), the remaining control characters below U+0020, and
         * U+FFFE/U+FFFF. The name and semantics are kept because
         * {@code filterEmoji} keeps exactly the characters accepted here.
         *
         * <p>The former comparison against 0x10000..0x10FFFF was removed: a
         * {@code char} never exceeds 0xFFFF, so that branch could not be reached.
         *
         * @param codePoint a single UTF-16 code unit (not a full code point)
         * @return {@code true} if the unit is an accepted, non-emoji character
         */
        private static boolean isEmojiCharacter(char codePoint) {
            switch (codePoint) {
                case 0x0:
                case 0x9:
                case 0xA:
                case 0xD:
                    return true;
                default:
                    break;
            }
            boolean belowSurrogates = codePoint >= 0x20 && codePoint <= 0xD7FF;
            boolean aboveSurrogates = codePoint >= 0xE000 && codePoint <= 0xFFFD;
            return belowSurrogates || aboveSurrogates;
        }
        
        /**
         * 过滤emoji 或者 其他非文字类型的字符
         * @param source
         * @return
         */
        public static String filterEmoji(String source) {
            
            if (!containsEmoji(source)) {
           //特殊处理
           source = filterSpecialCharacter(source);
    re
    turn source;//如果不包含,直接返回 } //到这里铁定包含 StringBuilder buf = null; int len = source.length(); for (int i = 0; i < len; i++) { char codePoint = source.charAt(i); if (isEmojiCharacter(codePoint)) { if (buf == null) { buf = new StringBuilder(source.length()); } buf.append(codePoint); } else { } } if (buf == null) { return source;//如果没有找到 emoji表情,则返回源字符串 } else { if (buf.length() == len) {//这里的意义在于尽可能少的toString,因为会重新生成字符串 buf = null; return source; } else { return buf.toString(); } } }
     /**
         * Removes characters outside the Basic Multilingual Plane when the text
         * contains a recognised emoji.
         *
         * <p>The text is first searched for a character in U+1F000..U+1F3FF,
         * U+1F400..U+1F7FF or U+2600..U+27FF. Only if one is found are all
         * supplementary characters (U+10000..U+10FFFF) and unpaired surrogates
         * removed. Removed characters are deleted, not replaced by a space.
         * Symbols in U+2600..U+27FF only trigger the clean-up; they are not
         * removed themselves.
         *
         * <p>The detection pattern in the original had lost its backslashes, which
         * turned it into a class of plain letters and digits; the escapes are
         * restored here. The null/empty guard gives the same results as the former
         * blank check, because whitespace-only text never matches the detector.
         *
         * @param source text to clean; may be {@code null}
         * @return the cleaned text, or {@code source} unchanged when it is
         *         {@code null}, empty, or contains no recognised emoji
         */
        public static String filterSpecialCharacter(String source) {
            if (source == null || source.isEmpty()) {
                return source;
            }
            if (!EMOJI_DETECTOR.matcher(source).find()) {
                return source;
            }
            return SUPPLEMENTARY_OR_SURROGATE.matcher(source).replaceAll("");
        }

        /** Matches one character from the emoji ranges that trigger the clean-up. */
        private static final Pattern EMOJI_DETECTOR = Pattern.compile(
                "[\ud83c\udc00-\ud83c\udfff]|[\ud83d\udc00-\ud83d\udfff]|[\u2600-\u27ff]",
                Pattern.UNICODE_CASE | Pattern.CASE_INSENSITIVE);

        /** Matches any supplementary character or unpaired surrogate half. */
        private static final Pattern SUPPLEMENTARY_OR_SURROGATE =
                Pattern.compile("[\ud800\udc00-\udbff\udfff\ud800-\udfff]");

    方法3:

        /**
         * Removes every character whose UTF-8 encoding is longer than three bytes.
         *
         * <p>The text is encoded to UTF-8 and walked sequence by sequence. One-,
         * two- and three-byte sequences are copied; sequences whose lead byte
         * announces four, five or six bytes are skipped. Any other byte (a stray
         * continuation byte, 0xFE, 0xFF) is copied as is.
         *
         * <p>The original decoded the buffer's whole backing array, so the result
         * carried one trailing NUL character per dropped byte. Only the bytes
         * actually written are decoded now.
         *
         * @param text text to filter; {@code null} yields {@code null}
         * @return the text without characters that need four or more UTF-8 bytes
         * @throws UnsupportedEncodingException declared for the charset lookup by name
         */
        public static String filterOffUtf8Mb4(String text) throws UnsupportedEncodingException {
            if (text == null) {
                return null;
            }

            byte[] bytes = text.getBytes("utf-8");
            ByteBuffer buffer = ByteBuffer.allocate(bytes.length);
            int i = 0;
            while (i < bytes.length) {
                int lead = bytes[i] & 0xFF; // read the byte as unsigned

                int sequenceLength;
                boolean keep;
                if ((lead >> 5) == 0x6) {            // 110xxxxx
                    sequenceLength = 2;
                    keep = true;
                } else if ((lead >> 4) == 0xE) {     // 1110xxxx
                    sequenceLength = 3;
                    keep = true;
                } else if ((lead >> 3) == 0x1E) {    // 11110xxx
                    sequenceLength = 4;
                    keep = false;
                } else if ((lead >> 2) == 0x3E) {    // 111110xx
                    sequenceLength = 5;
                    keep = false;
                } else if ((lead >> 1) == 0x7E) {    // 1111110x
                    sequenceLength = 6;
                    keep = false;
                } else {                             // single byte or unexpected byte
                    sequenceLength = 1;
                    keep = true;
                }

                // Never read past the end if a sequence is cut short.
                int available = Math.min(sequenceLength, bytes.length - i);
                if (keep) {
                    buffer.put(bytes, i, available);
                }
                i += available;
            }

            // Decode only what was written, not the zero-filled tail of the array.
            return new String(buffer.array(), 0, buffer.position(), "utf-8");
        }


    方法4:进行编码转换保存

    将需要处理的字符串进行编码转换,存储到数据库

    /**
        * 字符串转换ascii
        */
       public static String string2Unicode(String string) {    
           StringBuffer unicode = new StringBuffer();    
           for (int i = 0; i < string.length(); i++) {    
               // 取出每一个字符
               char c = string.charAt(i);    
               // 转换为unicode
               unicode.append("\u" + Integer.toHexString(c));
           }    
           return unicode.toString();
       }
       /**
        * Decodes text made of groups that each consist of a backslash, the letter
        * u, and a hexadecimal UTF-16 code unit, back into a string.
        *
        * <p>The input is split on the literal backslash-u marker. Anything before
        * the first marker is ignored, as in the original loop that started at
        * index 1. The original passed a two-character regex (backslash, u) to
        * {@code split}, which is an incomplete regex Unicode escape and throws
        * {@code PatternSyntaxException}; the backslash is now escaped for the
        * regex engine.
        *
        * @param unicode encoded text; {@code null} yields {@code null}
        * @return the decoded string
        * @throws NumberFormatException if a group is not valid hexadecimal
        */
       public static String unicode2String(String unicode) {
           if (unicode == null) {
               return null;
           }
           // Regex for a literal backslash followed by 'u'.
           String[] groups = unicode.split("\\\\u");
           StringBuilder decoded = new StringBuilder(groups.length);
           for (int i = 1; i < groups.length; i++) {
               // Each group is one UTF-16 code unit in hexadecimal.
               decoded.append((char) Integer.parseInt(groups[i], 16));
           }
           return decoded.toString();
       }

    在页面获取的时候进行处理

    // Decodes backslash-u escaped text (as produced by the server-side encoder)
    // in every element with class "nikeunicode" and shows the readable text.
    //
    // Each escape is read as up to four hex digits; anything after those digits
    // in a segment is kept as plain text. The text is read and written through
    // textContent instead of innerHTML so that a decoded nickname containing
    // markup is displayed literally and never parsed as HTML.
    function ascii2native(){
        var elements = document.getElementsByClassName("nikeunicode");

        for (var k = 0; k < elements.length; k++) {
            // "\\u" is the two-character marker: a backslash followed by 'u'.
            var segments = elements[k].textContent.split("\\u");
            var decoded = segments[0]; // text before the first escape stays as is
            for (var i = 1; i < segments.length; i++) {
                var code = segments[i];
                decoded += String.fromCharCode(parseInt(code.substring(0, 4), 16));
                if (code.length > 4) {
                    decoded += code.substring(4);
                }
            }
            elements[k].textContent = decoded;
        }
    }

    页面处理过的效果

    上文中的方法在android输入法自带的emoji下,没有起到效果,在上文方法2的if (isEmojiCharacter(codePoint)) 处加入下列判断

    /**
     * Tells whether a character belongs to the Han Unicode script.
     *
     * @param c the character to classify
     * @return {@code true} if the script of {@code c} is {@code HAN}
     */
    private static boolean isChinese(char c) {
        return Character.UnicodeScript.of(c) == Character.UnicodeScript.HAN;
    }
        
        /**
         * Tells whether a character lies in one of the Unicode blocks this class
         * treats as punctuation, spacing or plain ASCII.
         *
         * <p>Accepted blocks: General Punctuation, CJK Symbols and Punctuation,
         * Halfwidth and Fullwidth Forms, CJK Compatibility Forms, Vertical Forms
         * and Basic Latin. Note that Basic Latin covers ASCII letters and digits
         * too, not only punctuation.
         *
         * @param c the character to classify
         * @return {@code true} if the block of {@code c} is one of those listed
         */
        public static boolean isPunctuation(char c) {
            final Character.UnicodeBlock block = Character.UnicodeBlock.of(c);
            return block == Character.UnicodeBlock.GENERAL_PUNCTUATION
                    || block == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
                    || block == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS
                    || block == Character.UnicodeBlock.CJK_COMPATIBILITY_FORMS
                    || block == Character.UnicodeBlock.VERTICAL_FORMS
                    || block == Character.UnicodeBlock.BASIC_LATIN;
        }
        
        private static Boolean isUserDefined(char c) {
            Character.UnicodeBlock ub = Character.UnicodeBlock.of(c);
            if (ub == Character.UnicodeBlock.NUMBER_FORMS
                    || ub == Character.UnicodeBlock.ENCLOSED_ALPHANUMERICS
                    || ub == Character.UnicodeBlock.LETTERLIKE_SYMBOLS
                    || c == 'ufeff'
                    || c == 'u00a0'
                    )
                return true;
            return false;
        }
        
        /**
         * Heuristically decides whether a string looks garbled: among the
         * characters that are neither punctuation/ASCII nor one of the extra
         * skipped symbols, more than roughly 30% are not Han characters.
         *
         * <p>Characters accepted by {@code isPunctuation} or {@code isUserDefined}
         * are skipped. Counting uses ints (the original float counters stop
         * incrementing past 2^24), and the case where nothing is counted is
         * handled explicitly instead of relying on a NaN comparison.
         *
         * @param str text to examine; {@code null} is treated as not messy
         * @return {@code true} if the non-Han share of counted characters exceeds
         *         the threshold; {@code false} otherwise, including when no
         *         character was counted
         */
        public static boolean isMessy(String str)  {
            if (str == null) {
                return false;
            }

            int counted = 0;
            int nonHan = 0;
            for (int i = 0; i < str.length(); i++) {
                char c = str.charAt(i);
                if (isPunctuation(c) || isUserDefined(c)) {
                    continue;
                }
                counted++;
                if (!isChinese(c)) {
                    nonHan++;
                }
            }

            if (counted == 0) {
                return false; // nothing to judge; the original reached false via NaN
            }
            // Kept as a float ratio compared against the double literal, exactly
            // like the original, so borderline inputs are classified as before.
            float ratio = (float) nonHan / counted;
            return ratio > 0.3;
        }
  • 相关阅读:
    HDU 6333.Problem B. Harvest of Apples-组合数C(n,0)到C(n,m)求和-组合数学(逆元)+莫队 ((2018 Multi-University Training Contest 4 1002))
    HDU 6330.Problem L. Visual Cube-模拟到上天-输出立方体 (2018 Multi-University Training Contest 3 1012)
    HDU 6326.Problem H. Monster Hunter-贪心(优先队列)+流水线排序+路径压缩、节点合并(并查集) (2018 Multi-University Training Contest 3 1008)
    杭电1518 Square(构成正方形) 搜索
    POJ1659 Frogs' Neighborhood(青蛙的邻居) Havel-Hakimi定理
    杭电1133 排队买票 catalan
    hdu 5945 Fxx and game 单调队列优化dp
    Codeforces Round #278 (Div. 2) D. Strip 线段树优化dp
    hdu 4348 To the moon 主席树区间更新
    hdu 4417 Super Mario 树状数组||主席树
  • 原文地址:https://www.cnblogs.com/-lpf/p/5563015.html
Copyright © 2011-2022 走看看