#!/usr/bin/python3.4
# -*- coding: utf-8 -*-
# Scraper for Baidu Index (百度指数) values
# Screenshot tutorial: http://www.myexception.cn/web/2040513.html
#
# Baidu login URL: https://passport.baidu.com/v2/?login&tpl=mn&u=http%3A%2F%2Fwww.baidu.com%2F
# Baidu Index URL: http://index.baidu.com
import time
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from PIL import Image
import pytesseract
from aip import AipOcr
import os
import re
import threading
from queue import Queue
import json
import codecs
# Open the browser and log in to Baidu
def _load_account(path="../baidu/account.txt"):
    """Read the login credentials from *path*.

    The first line is sent to the user-name field and the second line to the
    password field, so the file must contain at least two lines.

    Returns:
        A ``(username, password)`` tuple, each stripped of surrounding
        whitespace.

    Raises:
        SystemExit: if the file cannot be read or has fewer than two lines
            (after prompting the user to fix the file).
    """
    try:
        with open(path, encoding='UTF-8') as fileaccount:
            account = [acc.strip() for acc in fileaccount]
        if len(account) < 2:
            raise ValueError("account.txt needs two lines: user name, then password")
    except (OSError, ValueError) as err:
        print(err)
        input("请正确在account.txt里面写入账号密码")
        raise SystemExit(1)
    return account[0], account[1]


def _submit_login(driver, username, password):
    """Clear the Baidu login form, type the credentials and submit it."""
    user_box = driver.find_element_by_id("TANGRAM__PSP_4__userName")
    password_box = driver.find_element_by_id("TANGRAM__PSP_4__password")
    user_box.clear()
    password_box.clear()
    user_box.send_keys(username)
    password_box.send_keys(password)
    driver.find_element_by_id("TANGRAM__PSP_4__submit").click()


def openbrowser():
    """Start Chrome, open the Baidu Index page and log in.

    The driver is stored in the module-level global ``browser`` so that other
    functions in this file can keep using the same session.

    The login confirmation is currently hard-wired to "y" (the interactive
    prompt is disabled), so the function reports success right after
    submitting the form without verifying it.  The retry branches are kept so
    the prompt can be switched back on.

    Raises:
        SystemExit: if the account file is missing or incomplete.
    """
    global browser
    # Read the credentials first: a bad account file then aborts before a
    # Chrome process has been started.
    username, password = _load_account()

    url = 'http://index.baidu.com/?tpl=trend&word=%D5%BD%C0%C7'
    browser = webdriver.Chrome()
    browser.get(url)
    _submit_login(browser, username, password)

    print("等待网址加载完毕...")
    # To confirm the login manually, replace the constant with:
    # select = input("请观察浏览器网站是否已经登陆(y/n):")
    select = 'y'
    while True:
        if select in ("y", "Y"):
            print("登陆成功!")
            print("准备打开新的窗口...")
            break
        if select in ("n", "N"):
            selectno = input("账号密码错误请按0,验证码出现请按1...")
            if selectno == "0":
                # Wrong credentials: re-read the file (the user may have
                # edited it) and submit the form again.
                try:
                    username, password = _load_account()
                except SystemExit:
                    # Do not leave the Chrome process behind on exit.
                    browser.quit()
                    raise
                _submit_login(browser, username, password)
            elif selectno == "1":
                # A captcha appeared: the user solves it in the browser.
                input("请在浏览器中输入验证码并登陆...")
            # Ask again whether the login went through after either action.
            select = input("请观察浏览器网站是否已经登陆(y/n):")
        else:
            print("请输入“y”或者“n”!")
            select = input("请观察浏览器网站是否已经登陆(y/n):")
def _quit_browser():
    """Close the shared Selenium session if there is one; never raises."""
    driver = globals().get('browser')
    if driver is None:
        # openbrowser() failed before it could create the driver.
        return
    try:
        driver.quit()
    except Exception as err:
        # The session may already be closed; that is not worth aborting for.
        print(err)


def _clean_ocr_index(code):
    """Convert raw OCR text into the index value string.

    Everything except ASCII letters and digits is dropped.  Letter runs made
    only of T/t or B/b are read as the digits they were mistaken for
    (T -> 7, B -> 8); any other letter run is replaced by a marker string so
    the bad reading is visible in the output.
    """
    data = ''.join(re.findall(r"[0-9A-Za-z]+", code))

    def repair(match):
        run = match.group(0)
        if all(ch in 'TtBb' for ch in run):
            return run.translate(str.maketrans('TtBb', '7788'))
        return '这个数据匹配有问题'

    return re.sub(r"[A-Za-z]+", repair, data)


def _append_record(filename, record):
    """Append *record* as one UTF-8 JSON object followed by ``,`` and a newline."""
    data_json = json.dumps(record, ensure_ascii=False) + ",\n"
    with open(filename, "ab") as out_file:
        out_file.write(data_json.encode('utf-8'))


def _remove_files(paths):
    """Delete temporary files; missing files are ignored, other errors printed."""
    for file_path in paths:
        try:
            os.remove(file_path)
        except FileNotFoundError:
            pass
        except OSError as err:
            print(err)


def _capture_tooltip(driver, keyword, path):
    """Screenshot the page and write the enlarged tooltip crop to ``<path>zoom.jpg``.

    Also writes ``<path>.png`` (full screenshot), ``<path>.jpg`` (tooltip
    crop) and ``<path><YYYY-MM-DD>.jpg`` (crop of the trend chart area).
    """
    imgelement = driver.find_element_by_xpath('//div[@id="viewbox"]')
    imgelement_index = driver.find_element_by_xpath('//div[@id="trend"]')
    print('imgelement:%s' % imgelement)
    locations = imgelement.location
    locations_index = imgelement_index.location
    print('locations_index:%s' % locations_index)
    print('locations:%s' % locations)
    time.sleep(1)
    # Element locations are page coordinates; subtract the scroll offset to
    # get the position inside the screenshot.
    scroll = driver.execute_script("return window.scrollY;")
    print('scroll:%s' % scroll)
    top = locations['y'] - scroll
    sizes = imgelement.size
    print('sizes:%s' % sizes)
    # Longer keywords push the number further right inside the tooltip.
    add_length = (len(keyword) - 2) * sizes['width'] / 15
    # Crop box of the index number inside the tooltip.
    rangle = (
        int(locations['x'] + sizes['width'] / 4 + add_length),
        int(top + sizes['height'] / 2),
        int(locations['x'] + sizes['width'] * 2 / 3),
        int(top + sizes['height']),
    )
    # Crop box of the trend chart (fixed 1256 x 365 area).
    rangle_index = (
        int(locations_index['x']),
        int(locations_index['y']),
        int(locations_index['x']) + 1256,
        int(locations_index['y']) + 365,
    )

    driver.save_screenshot(path + ".png")
    with Image.open(path + ".png") as shot:
        # A screenshot may carry an alpha channel, which JPEG cannot store.
        rgb_shot = shot.convert("RGB")
    rgb_shot.crop(rangle).save(path + ".jpg")
    today = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    rgb_shot.crop(rangle_index).save(path + today + ".jpg")

    # Enlarge the tooltip crop (2x wide, 3x high) to help the OCR engine.
    with Image.open(path + ".jpg") as jpgzoom:
        width, height = jpgzoom.size
        # Image.ANTIALIAS was an alias of LANCZOS and no longer exists in
        # current Pillow releases.
        out = jpgzoom.resize((width * 2, height * 3), Image.LANCZOS)
    # The file is PNG-encoded despite its ".jpg" name, as before.
    out.save(path + 'zoom.jpg', 'png', quality=95)


def getindex(keyword, day):
    """Scrape the Baidu Index values of *keyword* for the chosen period.

    Logs in through ``openbrowser()``, searches for the keyword, selects the
    period link ``//a[@rel="<day>"]`` and then hovers over the trend chart
    once per sample, reading the tooltip value by screenshot + OCR.

    Args:
        keyword: search term typed into the Baidu Index search box.
        day: 7, 30, 90, 180 or the string "all" (treated as 1000000
            samples).  It is used both in the period link XPath and as the
            number of hover positions.

    Side effects:
        Appends one JSON record per sample to ``index.json`` (success) or
        ``error.json`` (failure) in the working directory, rewrites
        ``../baidu/index.txt`` with one value per line, and uses
        ``../baidu/<n>*.png/jpg`` as scratch files.

    NOTE(review): the ``find_element_by_*`` / ``switch_to_window`` calls are
    the Selenium 3 API and were removed in Selenium 4 - confirm the pinned
    Selenium version.
    """
    try:
        openbrowser()
        time.sleep(3)
        # Open Baidu Index in a new window and switch to it.
        browser.execute_script('window.open("http://index.baidu.com");')
        handles = browser.window_handles
        browser.switch_to_window(handles[-1])
        search_box = browser.find_element_by_id("search-input-word")
        search_box.clear()
        search_box.send_keys(keyword)
        browser.find_element_by_id("searchWords").click()
        browser.maximize_window()
        time.sleep(5)
        # Select the period.
        sel = '//a[@rel="' + str(day) + '"]'
        browser.find_element_by_xpath(sel).click()
        time.sleep(2)
        xoyelement = browser.find_element_by_xpath('//div[@id="trend"]')
        print('xoyelement:%s' % xoyelement)

        # Hover start position inside the chart element.
        x_0 = 30
        y_0 = 248
        if day == "all":
            day = 1000000
        # Horizontal distance between two samples for each period; unknown
        # periods keep hovering over the same point.
        step = {7: 202, 30: 40, 90: 13.64, 180: 6.78,
                1000000: 3.37222222}.get(day, 0)

        num = 0     # number of samples taken; also names the scratch files
        index = []  # collected values, one per sample
        try:
            for _ in range(day):
                ActionChains(browser).move_to_element_with_offset(
                    xoyelement, x_0, y_0).perform()
                time.sleep(2)
                x_0 = x_0 + step

                path = "../baidu/" + str(num)
                data_list = {
                    'keyword': keyword,
                    # Timestamp of 24 hours ago; presumably because the
                    # index lags one day - TODO confirm.
                    'current_time': int(time.time()) - 86400,
                }
                try:
                    # Selenium/screenshot failures propagate and stop the
                    # sampling loop; OCR failures only fail this sample.
                    _capture_tooltip(browser, keyword, path)
                    try:
                        with Image.open(path + "zoom.jpg") as image:
                            code = pytesseract.image_to_string(image)
                        data = _clean_ocr_index(code) if code else ''
                        if data:
                            data_list['index'] = data
                            print(data_list)
                            _append_record("index.json", data_list)
                            index.append(data)
                        else:
                            index.append("数据获取没有成功")
                            data_list['index'] = '数据获取没有成功'
                            print(data_list)
                            _append_record("error.json", data_list)
                    except Exception as ocr_err:
                        index.append("数据获取没有成功")
                        data_list['index'] = '数据获取没有成功'
                        print(data_list)
                        _append_record("error.json", data_list)
                        print('*' * 20)
                        print(ocr_err)
                finally:
                    num = num + 1
                    # "<n>index.jpg" is not written by this function; it is
                    # kept in the list so a stale file is still cleaned up.
                    _remove_files([
                        path + "zoom.jpg",
                        path + ".jpg",
                        path + ".png",
                        path + "index.jpg",
                    ])
        except Exception as err:
            print('err:%s' % err)
            print('num:%s' % num)
        finally:
            print(index)
            _quit_browser()
            # UTF-8 so the Chinese failure marker can always be written.
            with open("../baidu/index.txt", "w", encoding='utf-8') as out_file:
                out_file.writelines(str(item) + "\n" for item in index)
            time.sleep(2)
    except Exception as err:
        print(err)
        _quit_browser()
if __name__ == "__main__":
    # Each character of the keyword takes roughly 12.5 px horizontally; the
    # crop box (rangle) in getindex may need adjusting for other widths.
    #
    # Period values accepted by getindex: 7, 30, 90, 180 or "all".
    #
    # Read the keywords, one per line.
    with open('TVB电视剧名.txt', 'r', encoding='UTF-8') as file:
        line = file.readlines()
    for i in line:
        # Keep only CJK ideographs, digits and ASCII letters; this also
        # strips the trailing newline.
        data_initial = re.findall(r"[\u4e00-\u9fa50-9a-zA-Z]+", i)
        keyword_data = ''.join(data_initial)
        if not keyword_data:
            # Blank or punctuation-only line: nothing to search for.
            continue
        getindex(keyword_data, 7)