zoukankan      html  css  js  c++  java
  • pyquery的中文编码问题

    # coding=UTF-8
    import urllib.request
    import pyquery
    import requests
    import time
    import json
    import pymysql
    import sys
    import math
    from datetime import datetime
    import time
    import csv
    from time import sleep
    import random
    from bs4 import BeautifulSoup
    import asyncio
    from pyppeteer import launch
    import pyppeteer
    from pyquery import PyQuery as pq
    import chardet
    import codecs
    
    # Demo: extract the community name from a locally saved HTML page whose
    # encoding is not known in advance. The encoding is sniffed from the raw
    # bytes with chardet and then used to decode the file before handing the
    # text to PyQuery.
    path='D:/code-py-download/02fang/xqxq_demo_utf8.html'

    # Read the raw bytes inside a context manager so the handle is closed
    # right away instead of being left to the garbage collector.
    with open(path, 'rb') as raw_file:
        txt = raw_file.read()

    # chardet reports the encoding as None when it cannot make a guess (for
    # example on an empty file). Passing None to open() would silently fall
    # back to the locale codec, so default to UTF-8 explicitly.
    encodings = chardet.detect(txt)['encoding'] or 'utf-8'

    with open(path, "r", encoding=encodings) as f:
        content = f.read()

    # Parsing does not need the file handle, so it happens after the file is closed.
    doc = pq(content)
    name = doc('.tit.clearfix h1>strong')
    title = name.text()
    print(title)  # expected output: 扬州天下花园
    
    # path='D:/code-py-download/02fang/xqxq_demo_utf8.html'
    # with open(path, "r")as f:
    #     content = f.read()
    #     doc = pq(content)
    #     name=doc('.tit.clearfix h1>strong')
    #     title=name.text()
    #     print(title) #UnicodeDecodeError: 'gbk' codec can't decode byte 
    
    
    # path='D:/code-py-download/02fang/小区详情demo.txt'   #ansi编码都不行
    # with open(path, "r")as f:
    #     content = f.read()
    #     doc = pq(content)
    #     name=doc('.tit.clearfix h1>strong')
    #     title=name.text()
    #     print(title) # 扬州天下花园
    
    # path='D:/code-py-download/02fang/xqxq_demo_utf8.html'
    # doc = pq(filename=path, encoding='utf-8')  # 直接读也不行
    # name=doc('.tit.clearfix h1>strong')
    # title=name.text()
    # print(title) # UnicodeDecodeError: 'gbk' codec can't decode byte
    
    # path='D:/code-py-download/02fang/小区详情demo.txt'
    # doc = pq(filename=path, encoding='utf-8')
    # name=doc('.tit.clearfix h1>strong')
    # title=name.text()
    # print(title)  # æ¬å·¤©ä¸è±å
    
    # path='D:/code-py-download/02fang/小区详情demo.txt'
    # doc = pq(filename=path, encoding='gbk')
    # name=doc('.tit.clearfix h1>strong')
    # title=name.text()
    # print(title)  # æ¬å·¤©ä¸è±å
  • 相关阅读:
    STL 全排列
    Happy Programming Contest
    输入外挂
    Crazy Professor
    订票助手 12306
    我的e证空间 出入境证照可在家自拍啦!
    雨生红球藻 虾青素 寻找正规产品!
    CAR 汽车团购网站—广州
    3G路由器、无线接入点(无线AP)、无线路由器!
    微软 Lightswitch 发布 V2 Beta,引入新的逻辑层基础技术!
  • 原文地址:https://www.cnblogs.com/yansc/p/15512731.html
Copyright © 2011-2022 走看看