zoukankan html css js c++ java

【爬虫】-爬取食品检验结果

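# Goal: crawl every drug-inspection record published on the site and export it to an Excel file.
# Status: only the collection of record links is implemented so far; the Excel export is not written yet.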
import requests
from bs4 import BeautifulSoup
import re


def find_all_a_tag(timeout=10):
    """
    Collect the URLs of the category pages linked from the index page.

    The index URL is hard-coded because the script targets a single site.
    Every anchor inside the element with id ``table297`` is scanned for a
    directory code made of ``C`` followed by five word characters (the same
    shape as ``CL0873`` in the index URL), and that code is appended to the
    site root.

    :param timeout: seconds to wait for the HTTP response before giving up
    :return: list of absolute category-page URLs, each ending with ``/``;
        empty when the ``table297`` element is not present
    """
    url = r'http://www.nifdc.org.cn/CL0873/'
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'lxml')
    # Debugging aid: print(soup.prettify()) shows the parsed tree if the
    # page layout needs to be checked.
    container = soup.find(id='table297')
    if container is None:
        # Nothing to scan; avoid an IndexError on a missing element.
        return []
    path_url = 'http://www.nifdc.org.cn/'
    # The backslash is essential: without it the pattern looks for the
    # literal text "Cwwwww" and never matches a real directory code.
    code_pattern = re.compile(r'C\w{5}')
    url_list = []
    for anchor in container.find_all('a'):
        match = code_pattern.search(str(anchor))
        if match is None:
            # Anchors without a directory code are not category links.
            continue
        url_list.append(path_url + match.group() + '/')
    return url_list


def single_page_get(url, timeout=10):
    """
    Fetch one page and return the anchors found inside its ``table5`` element.

    :param url: absolute URL of the page to download
    :param timeout: seconds to wait for the HTTP response before giving up
    :return: the ``<a>`` tags found under the element with id ``table5``;
        an empty list when the page has no such element
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'lxml')
    container = soup.find(id='table5')
    if container is None:
        # A page without the expected element should not abort the crawl.
        return []
    return container.find_all('a')


def get_all_a():
    """
    Build the list of record URLs found on every category page.

    Each anchor returned by ``single_page_get`` is checked, in this order:

    1. a path starting with ``attach`` and ending in ``.htm``/``.html`` is
       appended to the site root;
    2. an absolute ``http`` link ending in ``.htm``/``.html`` is added as is;
    3. a ``Cxxxxx/<digits>.html`` path contributes its file name, which is
       appended to the URL of the category page being processed.

    The three checks are independent, so one anchor can contribute more than
    one URL. Anchors matching none of them (for example bare category
    directories) are ignored.

    :return: list of URL strings in the order they were discovered
    """
    site_root = 'http://www.nifdc.org.cn/'
    # Compiled once, outside the loops. The backslashes matter: ``\w`` and
    # ``\d`` are character classes, and ``\.`` is a literal dot. The optional
    # trailing ``l`` keeps ``.html`` links from being cut down to ``.htm``.
    record_pattern = re.compile(r'C\w{5}/(\d+\.html)')
    absolute_pattern = re.compile(r'http.+\.html?')
    attach_pattern = re.compile(r'attach.+\.html?')

    all_list = []
    for tag in find_all_a_tag():
        for anchor in single_page_get(tag):
            markup = str(anchor)

            attach_match = attach_pattern.search(markup)
            if attach_match:
                all_list.append(site_root + attach_match.group())

            absolute_match = absolute_pattern.search(markup)
            if absolute_match:
                all_list.append(absolute_match.group())

            record_match = record_pattern.search(markup)
            if record_match:
                # Only the file name is kept; the directory comes from the
                # category page URL, which already ends with "/".
                all_list.append(tag + record_match.group(1))
    return all_list


def get_re():
    """Placeholder with no behaviour yet; calling it returns ``None``."""
    return None


if __name__ == '__main__':
    # Script entry point: gather every record URL and print one per line.
    all_list = get_all_a()
    for record_url in all_list:
        print(record_url)

爬虫源码

利用爬虫爬取各企业页面中所有的 a 标签内容，之后再从中提取信息数据；目前只完成了获取所有 a 标签链接这一步。

Win a contest, win a challenge

查看全文

相关阅读:
记一次struts项目空指针异常
 struts2问题（已解决）java.nio.file.InvalidPathException: Illegal char <:> at index 3: jar:file:
Road Map
API
Report of program history
正则表达式验证用户信息
 RegExp（ replace()的示例）
DOM与BOM部分示例
 伪类与伪元素
 第三次随笔（按钮外观改变）

原文地址：https://www.cnblogs.com/pandaboy1123/p/9712656.html