  • Python crawler: a requests + BeautifulSoup example that scrapes company profiles from cninfo (巨潮资讯)

    This is the first crawler I've written that is anywhere near complete, and honestly I feel bad about it: the code is crude, it's inefficient, nothing gets saved to a local file or database, and shoehorning in multithreading scrambled the order of the output...

    I'm posting it here as a cautionary example.
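One of the gaps mentioned above is that nothing gets saved locally. A minimal sketch of persisting each parsed record to a JSON Lines file; the `save_record` helper, the `companies.jsonl` filename, and the sample record are illustrative, not part of the original script:

```python
import json

def save_record(record, path="companies.jsonl"):
    # Append one company record per line; ensure_ascii=False keeps
    # Chinese field values human-readable in the output file.
    with open(path, "a", encoding="utf-8") as f:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

# Example usage: call this in place of print(jd1) inside getDetails.
save_record({"code": "000001", "Entire_Name": "Example Co."})
```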

    # -*- coding: utf-8 -*-
    """
    Created on Wed Jul 18 21:41:34 2018
    @author: brave-man
    blog: http://www.cnblogs.com/zrmw/
    """
    
    import requests
    from bs4 import BeautifulSoup
    import json
    from threading import Thread
    
    # Fetch the listed company's full name, English name, address, and legal
    # representative (any other profile field can be grabbed the same way)
    def getDetails(url):
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0"}
        res = requests.get(url, headers=headers)
        res.encoding = "GBK"
        soup = BeautifulSoup(res.text, "html.parser")
        details = {"code": soup.select(".table")[0].td.text.lstrip("股票代码:")[:6],
                   "Entire_Name": soup.select(".zx_data2")[0].text.strip(" "),
                   "English_Name": soup.select(".zx_data2")[1].text.strip(" "),
                   "Address": soup.select(".zx_data2")[2].text.strip(" "),
                   "Legal_Representative": soup.select(".zx_data2")[4].text.strip(" ")}
        # Round-trip details through a JSON string, intended for later storage
        jd = json.dumps(details)
        jd1 = json.loads(jd)
        print(jd1)
    # Fetch the stock codes of all listed companies
    def getCode():
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0"}
        res = requests.get("http://www.cninfo.com.cn/cninfo-new/information/companylist", headers=headers)
        res.encoding = "gbk"  # fixed: the original used the invalid codec name "gb1232"
        soup = BeautifulSoup(res.text, "html.parser")
        # print(soup.select(".company-list"))
        l1, l2, l3, l4 = [], [], [], []
        for i in soup.select(".company-list")[0].find_all("a"):
            l1.append(i.text[:6])
        for i in soup.select(".company-list")[1].find_all("a"):
            l2.append(i.text[:6])
        for i in soup.select(".company-list")[2].find_all("a"):
            l3.append(i.text[:6])
        for i in soup.select(".company-list")[3].find_all("a"):
            l4.append(i.text[:6])
        L = [l1, l2, l3, l4]
        print(L[0])
        return getAll(L)

    def getAll(L):
        # One worker per board: SZ main board, SME board, ChiNext, SH main board
        def t1(L):
            for i in L[0]:
                url_sszb = "http://www.cninfo.com.cn/information/brief/szmb{}.html".format(i)
                getDetails(url_sszb)
        def t2(L):
            for i in L[1]:
                url_zxqyb = "http://www.cninfo.com.cn/information/brief/szsme{}.html".format(i)
                getDetails(url_zxqyb)
        def t3(L):
            for i in L[2]:
                url_cyb = "http://www.cninfo.com.cn/information/brief/szcn{}.html".format(i)
                getDetails(url_cyb)
        def t4(L):
            for i in L[3]:
                url_hszb = "http://www.cninfo.com.cn/information/brief/shmb{}.html".format(i)
                getDetails(url_hszb)

        # The threaded version below is what scrambled the output order:
        # the four threads print interleaved results.
        # tt1 = Thread(target=t1, args=(L,))
        # tt2 = Thread(target=t2, args=(L,))
        # tt3 = Thread(target=t3, args=(L,))
        # tt4 = Thread(target=t4, args=(L,))
        # tt1.start(); tt2.start(); tt3.start(); tt4.start()
        # tt1.join(); tt2.join(); tt3.join(); tt4.join()

        t1(L)
        t2(L)
        t3(L)
        t4(L)

    if __name__ == "__main__":
        getCode()
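The commented-out `Thread` code is what changed the data order: four threads print whenever their requests finish. One way to keep concurrency while preserving order is `ThreadPoolExecutor.map`, which yields results in submission order no matter which request completes first. A sketch, where `fetch_details` stands in for a variant of `getDetails` that returns the parsed dict instead of printing it:

```python
from concurrent.futures import ThreadPoolExecutor

def fetch_details(url):
    # Placeholder for a getDetails variant that returns its result.
    return {"url": url}

codes = ["000001", "000002", "000004"]  # illustrative stock codes
urls = ["http://www.cninfo.com.cn/information/brief/szmb{}.html".format(c)
        for c in codes]

with ThreadPoolExecutor(max_workers=4) as pool:
    # map() runs the calls concurrently but returns results in the
    # same order as `urls`.
    results = list(pool.map(fetch_details, urls))
```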

    It also doesn't account for problems that come up in real use, such as network lag or stalled connections.
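One way to harden the requests against slow or flaky responses is a per-request timeout plus automatic retries, using a `requests.Session` with an `HTTPAdapter` configured from `urllib3`'s `Retry`. A sketch; the retry count, backoff, and status list are illustrative choices, not from the original:

```python
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session():
    session = requests.Session()
    # Retry transient server errors up to 3 times with exponential backoff.
    retry = Retry(total=3, backoff_factor=1,
                  status_forcelist=[500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session

session = make_session()
# Usage inside getDetails: session.get(url, headers=headers, timeout=10)
# raises requests.Timeout instead of hanging indefinitely.
```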

    And it really is slow. When I find time I'll share code for scraping cninfo with selenium + a browser. Good night~

  • Original post: https://www.cnblogs.com/zrmw/p/9333385.html