本周需要将爬虫爬下来的数据入库。因为之前已经写好了PHP的接口,可以直接通过Python调用PHP接口来实现,所以把方法总结一下。
# Python 编码问题:因为好久不用,所以很容易出问题
#!/usr/bin/python
# -*- coding: utf8 -*-
# Force the process-wide default encoding to UTF-8 (Python 2 only).
# FIX: the shebang must be the very first line of the file — placed after
# the coding declaration it is ignored by the OS loader.
import sys
reload(sys)                      # re-import sys so setdefaultencoding is visible again
sys.setdefaultencoding('utf8')   # implicit str<->unicode conversions now use UTF-8
# Python 连接数据库
# Connect to the local MySQL server and fetch the crawled articles
# (Python 2, MySQLdb driver).
import MySQLdb
conn = MySQLdb.connect(
    host = "localhost",
    port = 3306,        # FIX: MySQL's default port is 3306 — 22 is SSH and the connect would fail
    user = "root",
    passwd = "root",
    db = "test",
    charset = "utf8")   # match the connection charset to the UTF-8 table data
cur = conn.cursor()
sql = 'select title,url,publish_time from mp_articles'
cur.execute(sql)
info = cur.fetchall()   # tuple of (title, url, publish_time) rows
cur.close()
conn.close()            # a read-only SELECT needs no commit
# Python 调用 RESTful 接口
# POST one article to the PHP RESTful endpoint (Python 2, urllib/urllib2).
test_data = {'title':title,'srcUrl':srcUrl,'composeTime':composeTime}  # request fields as a dict
test_data_urlencode = urllib.urlencode(test_data)  # url-encode the payload before sending
requrl = 'http://47.90.20.84/addArticleFromSpider' # Laravel route handled in the controller
# Supplying data= makes urllib2 send a POST request (it would be GET without it).
# FIX: this explanation was plain uncommented text in the original line and broke the syntax.
req = urllib2.Request(url = requrl, data = test_data_urlencode)
res_data = urllib2.urlopen(req)  # send the request
res = res_data.read()            # read the response body returned by PHP
# try...except:当返回的参数有异常时,为了不中断程序的运行,需要用此方式来保证程序继续运行
try:
test_data = {'title':title,'srcUrl':srcUrl,'composeTime':composeTime}
test_data_urlencode = urllib.urlencode(test_data)
requrl = 'http://47.90.20.84/addArticleFromSpider'
req = urllib2.Request(url = requrl,data =test_data_urlencode)
res_data = urllib2.urlopen(req)
res = res_data.read()
print "addArticleFromSpider():" + res
except urllib2.HTTPError:
print "there is an error"
pass #跳过错误,不进行处理,直接继续执行
完整代码如下:
# -*- coding: <utf8> -*-
#!/usr/bin/python
import MySQLdb
import datetime
import time
import urllib
import urllib2
import json
import sys
reload(sys)
sys.setdefaultencoding('utf8')
conn = MySQLdb.connect(
host = "localhost",
port = 22,
user = "",
passwd = "",
db = "",
charset = "utf8")
cur = conn.cursor()
sql = 'select title,url,publish_time from mp_articles'
cur.execute(sql)
info = cur.fetchall()
#print len(info)
for row in info:
#print len(row)
title = row[0]
srcUrl = row[1]
publish_Time = row[2]
composeTime = time.mktime(publish_Time.timetuple())
composeTime = str(composeTime)
try:
test_data = {'title':title,'srcUrl':srcUrl,'composeTime':composeTime}
test_data_urlencode = urllib.urlencode(test_data)
requrl = 'http://47.90.20.84/addArticleFromSpider'
req = urllib2.Request(url = requrl,data =test_data_urlencode)
res_data = urllib2.urlopen(req)
res = res_data.read()
print "addArticleFromSpider():" + res
except urllib2.HTTPError:
print "there is an error"
pass
cur.close()
conn.commit()
conn.close()