zoukankan html css js c++ java

python 爬虫例子

#coding=UTF-8
import csv  #用于把爬取的数据存储为csv格式，可以excel直接打开的
import time  #用于对请求加延时，爬取速度太快容易被反爬
from time import sleep #同上
import random  #用于对延时设置随机数，尽量模拟人的行为
import requests  #用于向网站发送请求
from lxml import etree    #lxml为第三方网页解析库，强大且速度快

# Scrape one page of a price listing and append each matched table row to
# wood.csv. The query string asks for page 1 (page.curPage=1) and passes a
# URL-encoded name in priceName.
url = 'http://yz.yuzhuprice.com:8003/findPriceByName.jspx?page.curPage=1&priceName=%E7%BA%A2%E6%9C%A8%E7%B1%BB'
headers = {
    # Send a desktop-browser User-Agent with the request.
    'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36",
}
response = requests.get(url, headers=headers, timeout=10)
# Fail loudly on a 4xx/5xx answer instead of parsing an error page as data.
response.raise_for_status()
html = response.text
print(html)

parse = etree.HTML(html)
# Every element whose id attribute is "173200"; each match is read below as
# a table row with five <td> cells.
all_tr = parse.xpath('//*[@id="173200"]')

# Column order of the CSV; the keys must match the row dicts built below.
fieldnames = ['name', 'price', 'unit', 'supermaket', 'time']

# Open the output once for the whole run rather than once per row.
# 'a' appends to any existing file (and creates it when missing);
# utf_8_sig keeps the exported CSV readable when opened in Excel;
# newline='' is what the csv module requires to avoid blank lines between
# records on Windows.
with open('wood.csv', 'a', encoding='utf_8_sig', newline='') as fp:
    writer = csv.DictWriter(fp, fieldnames)
    for tr in all_tr:
        # Join the text nodes of each cell and trim surrounding whitespace.
        # A separate name is used so the element in `tr` is not shadowed.
        row = {
            'name': ''.join(tr.xpath('./td[1]/text()')).strip(),
            'price': ''.join(tr.xpath('./td[2]/text()')).strip(),
            'unit': ''.join(tr.xpath('./td[3]/text()')).strip(),
            'supermaket': ''.join(tr.xpath('./td[4]/text()')).strip(),
            'time': ''.join(tr.xpath('./td[5]/text()')).strip(),
        }

        print(row)
        writer.writerow(row)

yum install python3

pip install xxx  # xxx 为包名称；本例需要安装 requests 和 lxml

新建 wood.csv 文件，用于保存爬取到的数据（脚本以 'a' 追加模式打开该文件，文件不存在时也会自动创建）

查看全文

相关阅读:
python 连接操作mysql数据库
 （转）postfix疯狂外发垃圾邮件之分析与解决
 ansible 常用方法
 用python2.7.9 写个小程序搜索某个目录下行有某关键字
 python获取文件扩展名的方法(转)
ceph 池管理
 UVALive 5412 Street Directions
UVALive 3231 Fair Share
UVA 11478 Halum
2015 Multi-University Training Contest 4 hdu 5338 ZZX and Permutations

原文地址：https://www.cnblogs.com/fengwenzhee/p/15510531.html