一.爬子域名
#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
import re
import sys
def get(domain):
    """Query i.links.cn's subdomain lookup service and print every
    subdomain it reports for *domain*.

    :param domain: bare domain name to look up (e.g. ``ycxy.com``)
    """
    url = 'http://i.links.cn/subdomain/'
    # b2/b3/b4 are the site's search-option checkboxes.
    payload = {'domain': domain, 'b2': '1', 'b3': '1', 'b4': '1'}
    # BUG FIX: form fields belong in the POST body (data=); the original
    # passed params=, which puts them in the URL query string instead.
    r = requests.post(url=url, data=payload)
    con = r.text
    # Non-greedy match of the quoted value directly before an <input> tag.
    pattern = re.compile('value="(.+?)"><input')
    for sub in pattern.findall(con):
        print(sub)
if __name__ == '__main__':
    # Concatenate every command-line argument into one domain string.
    target = "".join(sys.argv[1:])
    get(target)
二.爬I春秋精华页标题
#!/usr/bin/python
#coding=GBK
import requests
import re
def gethtml():
    """Fetch the ichunqiu forum portal page and print every featured
    article title scraped from it."""
    headers = {
        'Host': 'bbs.ichunqiu.com',
        'Connection': 'close',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8',
    }
    response = requests.get(url='https://bbs.ichunqiu.com/portal.php', headers=headers)
    page = response.content
    # Article titles sit inside anchors carrying this exact inline style.
    titles = re.findall(r'target="blank" class="ui_colorG" style="color: #555555;">(.*?)</a></h3>', page)
    for title in titles:
        print(title)
# Run only when executed as a script: the call was previously made at
# module level, so merely importing this file fired a network request.
# (The unused binding ``s`` was also dropped -- gethtml() returns None.)
if __name__ == '__main__':
    gethtml()
三.爬妹子图片
#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests,re,sys
import urllib
def getimg():
    """Download gallery images from www.7160.com into d:/meinv/.

    Walks listing pages 1..297, scrapes each <img> source URL and saves
    it to a sequentially numbered .jpg file.
    """
    count = 0  # global file index -- hoisted out of the page loop so
               # filenames no longer reset (and overwrite) on each page
    for page in range(1, 298):
        url = 'http://www.7160.com/xingganmeinv/list_3_' + str(page) + '.html'
        r = requests.get(url=url)
        con = r.content
        # Non-greedy match of every image URL on the listing page.
        imgs = re.findall(r'<img src="(.+?)" alt="', con)
        # BUG FIX: the original did ``for n in tu: tu.append(n)``,
        # growing the list while iterating it -- an infinite loop.
        for src in imgs:
            urllib.urlretrieve(src, 'd:/meinv/%s.jpg' % count)
            count += 1

if __name__ == '__main__':
    getimg()
四.百度URL采集
#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import sys
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Browser-like headers reused for every request below; the
# X-Forwarded-For entry presents a fixed client IP to the server.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:48.0) Gecko/20100101 Firefox/48.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
    "Connection": "keep-alive",
    "X-Forwarded-For": "120.239.169.74",
}
def url(key, pages=1):
    """Collect result links from a Baidu search for *key* and print them.

    Each result link is requested once more so Baidu's redirect wrapper
    resolves to the real destination URL before printing.

    :param key:   search keyword
    :param pages: number of 10-result pages to walk; the default of 1
                  reproduces the original single-iteration behaviour
                  (``range(0, 10, 10)`` yields only 0).
    """
    for offset in range(0, pages * 10, 10):
        # NOTE(review): the stray '=' after the keyword is kept verbatim
        # from the original query string -- confirm it is intentional.
        bd_search = "https://www.baidu.com/s?word=%s=&pn=%s" % (key, str(offset))
        r = requests.get(bd_search, headers=headers, verify=False, timeout=2)
        soup = BeautifulSoup(r.text, "lxml")
        # Baidu wraps each organic result title in ".t > a".
        # (Loop variable renamed from ``url``, which shadowed this function.)
        for link in soup.select(".t > a"):
            real_url = link['href']
            try:
                # Follow the redirect wrapper to reach the final site.
                resolved = requests.get(real_url, headers=headers, verify=False, timeout=2)
                print(resolved.url)
                print(key)
            except Exception as e:
                print(e)
if __name__ == '__main__':
    # Everything after the script name forms the search keyword.
    keyword = "".join(sys.argv[1:])
    url(keyword)