Please credit the source when reposting:
http://www.cnblogs.com/darkknightzh/p/5715305.html
PubFig database website:
http://www.cs.columbia.edu/CAVE/databases/pubfig/
For copyright reasons, the database does not provide the images themselves, only links to them, and some of the links are already dead.
Notes: 1. Some of the URLs require getting past the Great Firewall, so it is best to use a proxy.
2. Both dev_urls.txt and eval_urls.txt can be downloaded from the official site.
3. I am new to Python, so the program is not pretty and still has some problems.
Problem 1: some of the files no longer exist; there is no way around this.
Problem 2: sometimes connecting to a URL takes a very long time, after which an exception is thrown with a message like the following:
HTTPConnectionPool(host='www.stardepot.ca', port=80): Max retries exceeded with url: /img/Miley_Cyrus_27.jpg (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x02AAC3B0>: Failed to establish a new connection: [Errno 11004] getaddrinfo failed',))
I do not yet know how to solve this.
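One partial workaround (not from the original post, just a sketch under my own assumptions) is to send every request through a requests Session that has a connection timeout and automatic retries, so an unreachable or very slow host fails quickly instead of hanging. The fetch helper name and the timeout/retry values below are my own choices:

import requests
from requests.adapters import HTTPAdapter
try:
    # the bundled urllib3 location differs between requests versions
    from requests.packages.urllib3.util.retry import Retry
except ImportError:
    from urllib3.util.retry import Retry

session = requests.Session()
retries = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retries))
session.mount('https://', HTTPAdapter(max_retries=retries))

def fetch(url, timeout=10):
    # return the response body, or None if the host cannot be reached in time
    try:
        r = session.get(url, timeout=timeout)
        if r.status_code == requests.codes.ok:
            return r.content
    except requests.exceptions.RequestException:
        return None
    return None

Hosts whose DNS lookup fails (the getaddrinfo error above) will still raise inside fetch, but the exception is caught there, so the download loop can simply skip that URL and move on.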
__author__ = 'XXX'

import os
import numpy as np
import re       # regular expression library
import requests
import time


def findAllStrLoc(inStr, findStr):
    """Return the positions of every occurrence of findStr inside inStr."""
    loc = []
    start = 0
    while True:
        curLoc = inStr.find(findStr, start)
        if curLoc == -1:        # find() returns -1 when the string is not found
            break               # search is complete, break out of the while loop
        start = curLoc + 1      # move to the next possible start position
        loc.append(curLoc)
    return loc


def loadData(dataPath, startLine):
    """Parse dev_urls.txt / eval_urls.txt into rows of
    [person, imagenum, url, x0, y0, x1, y1, md5sum]."""
    datas = []
    with open(dataPath, 'r') as f:
        for line in f.readlines()[startLine:]:
            loc = findAllStrLoc(line, '\t')         # the columns are tab-separated
            data = []
            data.append(line[0:loc[0]])             # person (the end index of a slice is excluded)
            data.append(line[loc[0]+1:loc[1]])      # imagenum
            data.append(line[loc[1]+1:loc[2]])      # url
            rect = line[loc[2]+1:loc[3]]            # rect, e.g. "x0,y0,x1,y1"
            rectLoc = re.findall(r'\d+', rect)      # pull out the four coordinates
            for ind in range(len(rectLoc)):
                data.append(rectLoc[ind])
            data.append(line[loc[3]+1:len(line)-1]) # md5sum (drop the trailing newline)
            datas.append(data)
    return np.array(datas)


def createimgfolder(imgFolder):
    if not os.path.isdir(imgFolder):
        os.makedirs(imgFolder)


def getImgNameFromURL(url):
    """Derive the image file name (and a matching .txt name) from the last URL segment."""
    loc = findAllStrLoc(url, '/')
    imgName = url[loc[len(loc)-1]+1:]
    txtName = imgName.split('.')[0] + '.txt'
    return (imgName, txtName)


def exists(path):
    """HEAD the URL and report whether the server answers with 200 OK."""
    r = requests.head(path)
    return r.status_code == requests.codes.ok


def main():
    print('loading data')
    imgInfo = loadData('D:/dev_urls.txt', 2)
    print('finish loading data')

    databaseFolder = 'D:/pubFig'
    createimgfolder(databaseFolder)

    for i in range(9526, len(imgInfo)):     # change the start index to resume an interrupted run
        curtime = time.strftime('%y%m%d-%H%M%S', time.localtime())
        imgFolder = databaseFolder + '/' + imgInfo[i][0]
        createimgfolder(imgFolder)
        url = imgInfo[i][2]
        (imgName, txtName) = getImgNameFromURL(url)
        try:
            if exists(url):
                img = requests.get(url).content     # download the image bytes

                imgPath = imgFolder + '/' + imgName
                f = open(imgPath, 'wb')
                f.write(img)
                f.close()

                txtPath = imgFolder + '/' + txtName
                f = open(txtPath, 'w')
                for j in range(4):                  # the four face-rectangle coordinates
                    f.write(imgInfo[i][j+3] + ' ')
                f.close()
                print('%s:%d/%d %s finish' % (curtime, i+1, len(imgInfo), url))
            else:
                print('%s:%d/%d %s does not exist' % (curtime, i+1, len(imgInfo), url))
        except Exception as e:
            print('%s:%d/%d %s exception %s' % (curtime, i+1, len(imgInfo), url, e))

    print('finish')


if __name__ == '__main__':
    main()
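Each line of dev_urls.txt also carries an md5sum, so one extra check that the script above does not do is to hash every downloaded file and compare it with that column; a mismatch usually means the server returned an error page instead of the real image. A minimal sketch, assuming the image has already been written to imgPath and expectedMd5 is the last column parsed by loadData (imgInfo[i][7] when the rect has exactly four numbers):

import hashlib

def md5Matches(imgPath, expectedMd5):
    # compare the MD5 digest of the saved file against the checksum from the url file
    with open(imgPath, 'rb') as f:
        digest = hashlib.md5(f.read()).hexdigest()
    return digest == expectedMd5.strip().lower()

Files for which md5Matches returns False can simply be deleted and re-downloaded later.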