需求:在 Web 开发中,用户输入里的特殊符号(如 <、>、&、引号)需要转义为 HTML 实体,使浏览器把它们当作普通文本而不是标记来处理,从而减少 XSS 等前端攻击。
注意:以下代码来源于 Tornado 源码(tornado.escape 模块)。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import html.entities
import typing
from typing import Union, Optional, Dict
# Types that to_unicode returns unchanged.
_TO_UNICODE_TYPES = (str, type(None))


def to_unicode(value: Union[None, str, bytes]) -> Optional[str]:
    """Return *value* as text, decoding byte strings as UTF-8.

    Args:
        value: ``None``, a ``str``, or UTF-8 encoded ``bytes``.

    Returns:
        ``value`` unchanged when it is already ``str`` or ``None``;
        otherwise the decoded string.

    Raises:
        TypeError: If ``value`` is not ``None``, ``str`` or ``bytes``.
        UnicodeDecodeError: If the bytes are not valid UTF-8.
    """
    if isinstance(value, _TO_UNICODE_TYPES):
        return value
    if not isinstance(value, bytes):
        raise TypeError("Expected bytes, unicode, or None; got %r" % type(value))
    return value.decode("utf-8")
# Matches each single character that xhtml_escape has to replace.
_XHTML_ESCAPE_RE = re.compile("[&<>\"']")
# Entity written in place of each character matched by _XHTML_ESCAPE_RE.
_XHTML_ESCAPE_DICT = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
}
def xhtml_escape(value: Union[str, bytes]) -> str:
    """Escape ``<``, ``>``, ``"``, ``'`` and ``&`` in *value*.

    Args:
        value: Text to escape; it is passed through ``to_unicode`` first,
            so ``bytes`` input is accepted.

    Returns:
        The text with every character matched by ``_XHTML_ESCAPE_RE``
        replaced by its entry in ``_XHTML_ESCAPE_DICT``.
    """
    return _XHTML_ESCAPE_RE.sub(
        lambda match: _XHTML_ESCAPE_DICT[match.group(0)], to_unicode(value)
    )
def _build_unicode_map() -> Dict[str, str]:
    """Build the lookup table from HTML entity names to their characters.

    Returns:
        A dict mapping every name in ``html.entities.name2codepoint``
        (for example ``"lt"``) to the one-character string for its code
        point (``"<"``).
    """
    return {
        name: chr(codepoint)
        for name, codepoint in html.entities.name2codepoint.items()
    }


# Built once at import time; _convert_entity reads it for named entities.
_HTML_UNICODE_MAP = _build_unicode_map()
def _convert_entity(m: typing.Match) -> str:
    """Return the replacement text for one matched ``&...;`` reference.

    Written as a ``re.sub`` callback: group 1 of the match is ``"#"`` for a
    numeric reference (empty otherwise) and group 2 is the entity body.

    Args:
        m: The match object for a single entity reference.

    Returns:
        The referenced character, or the original ``&...;`` text when the
        reference is not a valid number / known entity name.
    """
    body = m.group(2)
    if m.group(1) == "#":
        try:
            # A leading "x"/"X" marks a hexadecimal reference.
            if body[:1].lower() == "x":
                return chr(int(body[1:], 16))
            return chr(int(body))
        except (ValueError, OverflowError):
            # ValueError: not a number, or outside the range chr() accepts.
            # OverflowError: number too large for chr() to convert at all.
            return "&#%s;" % body
    try:
        return _HTML_UNICODE_MAP[body]
    except KeyError:
        # Unknown entity name: leave the reference as it was written.
        return "&%s;" % body
def xhtml_unescape(value: Union[str, bytes]) -> str:
    """Replace entity references in *value* with the characters they name.

    Args:
        value: Escaped text; it is passed through ``to_unicode`` first, so
            ``bytes`` input is accepted.

    Returns:
        The text with each ``&name;``, ``&#N;`` or ``&#xN;`` reference
        substituted by ``_convert_entity``.
    """
    # The body must be r"\w+?" (word characters); a bare "w+?" would only
    # match runs of the letter "w" and leave real entities untouched.
    return re.sub(r"&(#?)(\w+?);", _convert_entity, to_unicode(value))
if __name__ == '__main__':
    # Demo: escape a script tag, then convert it back again.
    src_text = '<script>alert(1)</script>'
    ret_escape = xhtml_escape(src_text)
    print(ret_escape)
    reback = xhtml_unescape(ret_escape)
    print(reback)
    # Expected output:
    #   &lt;script&gt;alert(1)&lt;/script&gt;
    #   <script>alert(1)</script>