  • Example: scraping web page information with Python

    Requirement: scrape company information from the target website.

    The code is as follows:

    import json
    import os
    import shutil
    import requests
    import re
    import time
    
    requests.packages.urllib3.disable_warnings()
    
    # Request the URL and return the response body
    def getPage(url,headers):
        try:
            response = requests.get(url=url, headers=headers, verify=False)
            response.encoding = 'utf-8'
            if response.status_code == 200:
                #print (response.text)
                return response.text
            else:
                print('Request failed: {} status: {}'.format(url, response.status_code))
        except Exception as e:
            print('Request failed: {} error: {}'.format(url, e))
            return None
    
    # Remove duplicate lines from a file
    def file2uniq(file,destpath):
        sum = 0
        sum_pre = 0
        addrs = set()
        with open(file, 'r',encoding='utf8') as scan_file:
            for line in scan_file.readlines():
                sum_pre += 1
                # addr = get_addr(line)
                # line.decode('utf8')
                addrs.add(line)
        with open(destpath, 'w',encoding='utf8') as infile:
            while len(addrs) > 0:
                sum += 1
                infile.write(addrs.pop())
    
        if (os.path.exists(file)):
            os.remove(file)
        try:
            os.rename(destpath, file)
        except Exception as e:
            print (e)
            print('rename file fail\n')
        else:
            print('rename file success\n')
        #print(addrs)
        print("Lines before dedup: " + str(sum_pre))
        print("Lines after dedup: " + str(sum))
        return sum_pre,sum
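
    # Note (not in the original script): set.pop() above returns lines in an
    # arbitrary order, so the deduplicated file loses the original ordering.
    # If ordering matters, a minimal order-preserving sketch (assuming Python 3.7+,
    # where dicts keep insertion order; the name file2uniq_ordered is hypothetical):
    def file2uniq_ordered(file, destpath):
        with open(file, 'r', encoding='utf8') as src:
            unique = dict.fromkeys(src)            # dict keys keep first-seen order
        with open(destpath, 'w', encoding='utf8') as dst:
            dst.writelines(unique)
        os.replace(destpath, file)                 # overwrite the original file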
    
    
    # Extract the target fields from the page with a regular expression
    def parseHtml(html):
        #pattern = re.compile(r'<tr> <td class="tx">.+\s(.+)', re.I)  # case-insensitive; match the stock name

        # Case-insensitive; capture the full company name
        pattern = re.compile(r'<td class="text-center">.+</td> <td> <a href="/firm_.+">\s(.+)', re.I)

        # Capture the securities firm
        #pattern = re.compile(r'\t(.+)[\s]+</a> </td> <td class="text-center">.+</td> <td class="text-center">.+</td> </tr>', re.I)
        #pattern = re.compile(r'\t(.+)\s</a> </td> <td class="text-center">.+</td> <td class="text-center">.+</td> </tr> <tr> <td class="tx">', re.I)  # case-insensitive
        #pattern = re.compile(r'</a>\s</td>\s<td class="text-center">.+</td> <td> <a href="/firm_.+.html">\s(.+)[\s]+</a> </td> <td> <a href="/firm_.+.html">\s(.+)', re.I)  # case-insensitive; match company names
    
        items = re.findall(pattern, html)
        #print (items)
        for item in items:
            yield {
                'orgName': item.strip(),
            }
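
    # Hedged alternative (not part of the original post): the whitespace-sensitive
    # regex above breaks easily when the page markup changes. If the third-party
    # bs4 package is installed, the same company-name column could be extracted
    # with an HTML parser; the CSS selector below is an assumption about the page.
    def parseHtmlSoup(html):
        from bs4 import BeautifulSoup              # third-party dependency
        soup = BeautifulSoup(html, 'html.parser')
        for a in soup.select('td a[href^="/firm_"]'):
            yield {'orgName': a.get_text(strip=True)}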
    
    def write2txt(content):
        with open(file, 'a', encoding='utf-8') as f:
            f.write(json.dumps(content, ensure_ascii=False) + '\n')
    
    
    def removeStr(old_str,new_str):
        """
        with open('sanban.txt', 'a', encoding='utf-8') as fpr:
            content = fpr.read()
        content = content.replace(r'{"orgName": "', '')
        content = content.replace(r'"}', '')
        """
        file_data = ""
        with open(file, 'r', encoding='utf-8') as f:
            for line in f:
                if old_str in line:
                    line = line.replace(old_str,new_str)
                file_data += line
        with open(file, 'w', encoding='utf-8') as f:
            f.write(file_data)
    
    
    def main(page):
        #url = 'https://www.qichacha.com/elib_sanban.html?p=' + str(page)
        url = 'https://www.qichacha.com/elib_ipo.html?p=' + str(page)  # https://www.qichacha.com/elib_ipo.html?p=2
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
        }
        print (url)
        html = getPage(url,headers)
        #print (html)
        if html is None:
            return  # request failed; skip this page
        for item in parseHtml(html):
            print(item)
            write2txt(item)
        removeStr(r'{"orgName": "','')
        removeStr(r'"}', '')
    
        file2uniq(file, destpath)
    
    if __name__ == '__main__':
        file = r'orgName.txt'
        #file = r'midOrg.txt'
        #sourcepath = r'sanban.txt'
        destpath = r'temp.txt'
        for page in range(1,2):
            main(page)
            time.sleep(1)
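
    Since write2txt() serializes each record as JSON and removeStr() immediately strips the '{"orgName": "' and '"}' wrappers back off, those two steps can be collapsed. A minimal sketch, assuming the same parseHtml() output (the helper name write_name is hypothetical and not part of the original post):

    def write_name(item, path):
        # append the bare company name, one per line; file2uniq() can still
        # deduplicate the file afterwards
        with open(path, 'a', encoding='utf-8') as f:
            f.write(item['orgName'] + '\n')

    In main(), the call to write2txt(item) and the two removeStr() calls could then be replaced by a single write_name(item, file).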

  • Original post: https://www.cnblogs.com/qmfsun/p/11322893.html