zoukankan html css js c++ java

Python HTML特殊符号的转义与反转义

需求：在 Web 开发过程中，经常需要把用户输入中的特殊符号转义为 HTML 实体，使浏览器将其当作普通文本显示而不是当作标签解析，从而降低 XSS 等前端注入攻击的风险。
注意：此代码来源于 Tornado 源码（tornado/escape.py）。

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import html.entities
import typing
from typing import Union, Optional, Dict

# Types that to_unicode() returns unchanged: text and None.
_TO_UNICODE_TYPES = (str, type(None))

def to_unicode(value: Union[None, str, bytes]) -> Optional[str]:  # noqa: F811
    """Return *value* as text, decoding ``bytes`` as UTF-8.

    :param value: a ``str``, ``bytes`` or ``None``.
    :return: ``str`` and ``None`` are returned unchanged; ``bytes`` are
        returned decoded as UTF-8.
    :raises TypeError: if *value* is none of the accepted types.
    """
    if isinstance(value, bytes):
        return value.decode("utf-8")
    if isinstance(value, _TO_UNICODE_TYPES):
        return value
    raise TypeError("Expected bytes, unicode, or None; got %r" % type(value))

# Matches each single character that xhtml_escape() replaces.
# The inner double quote must be backslash-escaped; without the backslash the
# string literal ends early and the module fails to parse.
_XHTML_ESCAPE_RE = re.compile("[&<>\"']")
# Replacement entity for every character matched by _XHTML_ESCAPE_RE.
_XHTML_ESCAPE_DICT = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
}

def xhtml_escape(value: Union[str, bytes]) -> str:
    """Escape the characters ``<``, ``>``, ``"``, ``'`` and ``&`` in *value*.

    :param value: text to escape; it is passed through ``to_unicode`` first,
        so ``bytes`` input is accepted as well.
    :return: the text with each of those characters replaced by its entry in
        ``_XHTML_ESCAPE_DICT``.
    """
    def _entity_for(match):
        # The pattern matches exactly one character, which is the dict key.
        return _XHTML_ESCAPE_DICT[match.group(0)]

    text = to_unicode(value)
    return _XHTML_ESCAPE_RE.sub(_entity_for, text)

def _build_unicode_map() -> Dict[str, str]:
    """Build a lookup table from HTML entity names to their characters.

    :return: a dict mapping every name in ``html.entities.name2codepoint``
        (for example ``"amp"``) to the one-character string for its code point.
    """
    return {
        entity: chr(codepoint)
        for entity, codepoint in html.entities.name2codepoint.items()
    }


# Built once at import time; _convert_entity() looks named entities up here.
_HTML_UNICODE_MAP = _build_unicode_map()

def _convert_entity(m: typing.Match) -> str:
    """Translate one matched entity reference into its character.

    Used as the ``re.sub`` callback of ``xhtml_unescape``.

    :param m: a match whose group 1 is ``"#"`` for a numeric reference (empty
        otherwise) and whose group 2 is the text between ``&``/``&#`` and ``;``.
    :return: the decoded character, or the reference text rebuilt unchanged
        when it cannot be decoded (bad digits, code point out of range,
        unknown entity name).
    """
    body = m.group(2)
    if m.group(1) == "#":
        try:
            if body[:1].lower() == "x":
                # Hexadecimal form: &#xHH;
                return chr(int(body[1:], 16))
            # Decimal form: &#DDD;
            return chr(int(body))
        except (ValueError, OverflowError):
            # int() rejects malformed digits with ValueError; chr() raises
            # ValueError above 0x10FFFF and OverflowError for integers too
            # large for a C int. Leave such references as they were.
            return "&#%s;" % body
    try:
        return _HTML_UNICODE_MAP[body]
    except KeyError:
        # Unknown entity name: keep the original text.
        return "&%s;" % body


def xhtml_unescape(value: Union[str, bytes]) -> str:
    """Replace HTML entity references in *value* with their characters.

    :param value: text containing references such as ``&lt;``, ``&#39;`` or
        ``&#x27;``; it is passed through ``to_unicode`` first, so ``bytes``
        input is accepted as well.
    :return: the text with every reference handed to ``_convert_entity`` and
        replaced by its result.
    """
    # ``\w+?`` matches the entity name or the digits. Without the backslash
    # the pattern only matches runs of the literal letter "w", so nothing is
    # ever unescaped.
    return re.sub(r"&(#?)(\w+?);", _convert_entity, to_unicode(value))

if __name__ == '__main__':
    # Round-trip demo: escape a script tag, print it, then restore and print it.
    sample = '<script>alert(1)</script>'
    escaped = xhtml_escape(sample)
    print(escaped)
    print(xhtml_unescape(escaped))

    # The bare string below is a no-op statement that records the expected
    # output of the two print calls (its label is Chinese for "Output").
    """
    输出结果：
        &lt;script&gt;alert(1)&lt;/script&gt;
        <script>alert(1)</script>
    """

查看全文

相关阅读:
IP负载均衡技术
 ES6 克隆对象浅克隆：只能克隆原始对象自身的值，不能克隆它继承的值
 多层nginx中的压缩问题 api接口>1M数据的返回浏览器网关
 Status Code: 431 Request Header Fields Too Large
研发过程中的测试工作
 dede文章页调用当前栏目链接方法
 dedecms做好的网站怎么上传到网上？
如何修改＂DEDECMS 提示信息!＂方法!
dedecms搜索提示＂关键字不能小于2个字节！＂
修改cms版权等等信息

原文地址：https://www.cnblogs.com/ygbh/p/14061489.html