# 只爬2页 ,练习下
import logging
import requests
import bs4
import json
import psycopg2
from io import StringIO
from urllib import parse
# from proxy_ip import proxyip
# Configure the root logger: INFO level and above, each record prefixed with
# its timestamp and level name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s- %(message)s')
# Module-level accumulator: save_to_txt appends one formatted string per
# parsed book row, so entries persist across calls to main().
blist = []
def main(page):
    """Fetch one page of the vote ranking and hand the parsed page to save_to_txt.

    Args:
        page: Page number; it is converted with ``str`` and appended to the
            ranking URL.

    Returns:
        None. When the page could not be downloaded the page is skipped.
    """
    url = 'https://www.92qb.com/book/allvote/0/' + str(page)
    logging.info(url)
    html = request_douban(url)
    if html is None:
        # request_douban yields None when the request fails or the status is
        # not 200; there is nothing to parse in that case.
        logging.warning('no HTML received for %s, skipping page', url)
        return None
    soup = bs4.BeautifulSoup(html, 'lxml')
    save_to_txt(soup)
    return None
def request_douban(url, timeout=10):
    """Download *url* and return the response body decoded as GBK.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` may block indefinitely.

    Returns:
        The page text on HTTP 200, otherwise None (non-200 status or any
        ``requests.RequestException``, including timeouts).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/88.0.4324.146 Safari/537.36',
    }
    # NOTE(review): this proxy is only printed and logged; it is not passed to
    # requests.get below, so the request goes out directly. Confirm whether
    # it should be applied via the `proxies` argument.
    proxyooo = {'https': '182.84.144.12:3256'}
    print(proxyooo)
    logging.info('proxyip ' + str(proxyooo))
    try:
        target_response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        logging.warning('request to %s failed', url, exc_info=True)
        return None
    if target_response.status_code != 200:
        logging.warning('unexpected status %s for %s', target_response.status_code, url)
        return None
    # Decode the body as GBK before reading the text.
    target_response.encoding = 'gbk'
    return target_response.text
def save_to_txt(soup):
    """Extract book rows from a parsed ranking page and append them to ``blist``.

    Each ``<ul>`` inside the element with class ``clearfix rec_rullist`` is
    treated as one row. The six extracted fields are joined with a single
    space and appended to the module-level ``blist``.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find`` / ``find_all``
            interface.

    Returns:
        None. Rows missing an expected cell or link are skipped with a
        warning instead of aborting the remaining rows.
    """
    container = soup.find(class_="clearfix rec_rullist")
    if container is None:
        logging.warning('book list container not found; nothing parsed')
        return None
    booklist = container.find_all("ul")
    logging.debug('booklist ' + str(booklist))
    for row in booklist:
        try:
            title_cell = row.find(class_="two")
            shuming = title_cell.string
            zuozhe = row.find(class_="four").string
            leixing = row.find(class_="sev").string
            zhishu = row.find(class_="five").string
            wanchengriqi = row.find(class_="six").string
            dizhi = title_cell.find('a', href=True)['href']
        except (AttributeError, TypeError, KeyError):
            # A missing cell makes find() return None, so the attribute or
            # item access fails; skip just this row.
            logging.warning('skipping malformed book row', exc_info=True)
            continue
        tap = (shuming, zuozhe, leixing, dizhi, zhishu, wanchengriqi)
        blist.append(' '.join('%s' % idd for idd in tap))
    # Log the accumulated list once per call rather than once per row.
    logging.info(blist)
    return None
def writedatabase(blist):
    """Bulk-load text rows into a PostgreSQL table using COPY.

    Connection settings and the target table name are read from
    ``config/kafka_and_postgres_config.json``.

    Args:
        blist: Iterable of strings, one per table row; each is written on
            its own line of the COPY input.

    Returns:
        None.

    Raises:
        OSError: If the config file cannot be opened.
        KeyError: If a required ``postgres_*`` key is missing from the config.
        psycopg2.Error: If connecting or copying fails; the cursor and
            connection are still closed.
    """
    with open("config/kafka_and_postgres_config.json") as json_file:
        conf = json.load(json_file)
    pghost = conf['postgres_host']
    pguser = conf['postgres_user']
    pgpassword = conf['postgres_password']
    pgdatabase = conf['postgres_database']
    pgtable = conf['postgres_table']

    # One line per row, each terminated by a newline.
    s = ''.join(action + '\n' for action in blist)
    logging.info(s)

    conn = psycopg2.connect(host=pghost, user=pguser, password=pgpassword, database=pgdatabase)
    try:
        cur = conn.cursor()
        try:
            # NOTE(review): copy_from splits fields on a tab unless `sep` is
            # given, and five columns are listed here; confirm that the rows
            # passed in are formatted to match.
            cur.copy_from(StringIO(s), pgtable,
                          columns=('shuming', 'zuozhe', 'leixing', 'zhuangtai', 'dizhi'))
            conn.commit()
        finally:
            cur.close()
    finally:
        # Release the connection even when the copy or commit fails.
        conn.close()
    print('完成')
if __name__ == "__main__":
    # Script entry point: process ranking pages 1 and 2 in order.
    for page_number in (1, 2):
        main(page_number)