需要用 Python 写个脚本来提取 img 标签及其 src 的内容。在存数据的时候折腾了很久,原因是把 list 类型的数据直接放进了 SQL 语句里,结果一直报下面这个错误:
脑子抽了,以为是src里面转义字符的问题,就一直往这个方向整
后来才发现,是直接把 list 类型的值传给了 SQL 字符串的 format,于是方括号 [] 也一起被转成了字符串,例如 YImage='['http://www.baidu.com']'
执行的时候,'[' 被当成了一个完整的字符串,后面的 http…… 自然就无法识别了。哎呀,真笨。
解决:遍历 list,把其中的每个元素分别填到 SQL 占位符对应的位置,而不是把整个 imgSrc 列表直接放到 img 占位符处
源码如下:
# coding=utf-8
"""Copy image URLs found in product descriptions into ExtraBookInfo.

The script reads ``Products.Content``, pulls out the ``<img .../>`` tags,
extracts the ``.jpg`` URLs they contain and writes each URL to
``ExtraBookInfo.YImage`` (update when the product already has a row there,
insert otherwise).
"""
import re

# Connection settings shared by every query in this script.
_DB_CONFIG = {
    'server': '****',
    'user': 'User',
    'password': '****',
    'database': '*****',
    'charset': 'cp936',
}

# Self-closing <img ... /> tags.
_IMG_TAG_RE = re.compile(r'<img[^>]*/>')
# An http(s) URL ending in ".jpg".  The dot is escaped and the URL may not
# cross a quote or angle bracket, so a match stays inside one attribute.
_JPG_URL_RE = re.compile(r'http[^"\'<>]*?\.jpg')

_UPDATE_SQL = 'update ExtraBookInfo set YImage=%s WHERE ProductID=%s'
_INSERT_SQL = 'insert into ExtraBookInfo (ProductID,YImage) values(%s,%s)'


def _open_connection():
    """Open a new pymssql connection using the shared settings."""
    # Imported here so the pure parsing helpers stay importable (and
    # testable) on machines without the SQL Server driver installed.
    import pymssql
    return pymssql.connect(**_DB_CONFIG)


def _iter_rows(cur):
    """Yield the remaining rows of *cur*, skipping undecodable ones.

    A row whose text cannot be decoded makes ``fetchone()`` raise
    ``UnicodeDecodeError``; that row is skipped and fetching continues.
    NOTE(review): like the original loop, this assumes the cursor has
    already moved past the bad row when the error is raised - confirm
    against the driver.
    """
    while True:
        try:
            row = cur.fetchone()
        except UnicodeDecodeError:
            continue
        if not row:
            return
        yield row


def connectDB():
    """Return ``[ProductID, [img_tag, ...]]`` for products with images.

    Only products whose ``Content`` is not NULL and contains at least one
    ``<img .../>`` tag are included.
    """
    conn = _open_connection()
    try:
        cur = conn.cursor()
        cur.execute(
            'select ProductID,Content from Products '
            'WHERE (not Content IS NULL )')
        resultList = []
        for row in _iter_rows(cur):
            img_tags = parseContent(row[1])
            if img_tags:
                resultList.append([row[0], img_tags])
        return resultList
    finally:
        conn.close()


def parseContent(content):
    """Return every self-closing ``<img .../>`` tag found in *content*.

    An empty or ``None`` *content* yields an empty list.
    """
    if not content:
        return []
    return _IMG_TAG_RE.findall(content)


def saveImg(resultList):
    """Write the image URLs of each product to ``ExtraBookInfo.YImage``.

    *resultList* holds ``[ProductID, [img_tag, ...]]`` entries as produced
    by ``connectDB``.  One statement is executed (and committed) per URL:
    an UPDATE when the product already has an ExtraBookInfo row, an INSERT
    otherwise.  Values are passed as query parameters, never formatted
    into the SQL text.
    """
    # A set makes the per-product membership test O(1).
    existing_ids = set(getExtraBookProductIDList())
    conn = _open_connection()
    try:
        cur = conn.cursor()
        for entry in resultList:
            product_id = entry[0]
            # Extract the URLs before branching so both paths use the URLs
            # of *this* product.
            img_srcs = getImgSrc(entry[1])
            if product_id in existing_ids:
                for img in img_srcs:
                    print(_UPDATE_SQL, (img, product_id))
                    cur.execute(_UPDATE_SQL, (img, product_id))
                    conn.commit()
            else:
                for img in img_srcs:
                    cur.execute(_INSERT_SQL, (product_id, img))
                    conn.commit()
    finally:
        conn.close()


def getExtraBookProductIDList():
    """Return the ``ProductID`` of every row in ``ExtraBookInfo``."""
    conn = _open_connection()
    try:
        cur = conn.cursor()
        cur.execute('select ProductID from ExtraBookInfo')
        return [row[0] for row in _iter_rows(cur)]
    finally:
        conn.close()


def getImgSrc(result):
    """Return the ``.jpg`` URLs found in the given ``<img>`` tag strings.

    *result* is an iterable of tag strings.  URLs are returned in order of
    appearance across all tags; an empty input yields an empty list.
    """
    img_srcs = []
    for tag in result:
        found = _JPG_URL_RE.findall(tag)
        print(found)
        img_srcs.extend(found)
    return img_srcs


if __name__ == "__main__":
    resultList = connectDB()
    saveImg(resultList)
*********
*******
不要轴。。。。。。。。