一、效果图
二、源码
def test_content_url(self):
    """Run the configured content-page extraction rules against a test URL.

    Reads the URL from ``self.test_url_var``, downloads it with
    ``self.get_html`` and evaluates every row of ``self.content_tree``
    against the page. Each extracted value is appended to
    ``self.test_text`` as ``"<name>: <value> "`` and collected in a dict
    that is printed once all rows have been processed.

    Row layout, as used by this method (``values`` of each tree item):
        [0] field name (label and result-dict key)
        [1], [2] arguments handed to ``self.deal_with_sustring``
        [3] regular expression used in regex mode
        [4] 0 -> evaluate here; anything else -> row is skipped with a
            console message saying it is extracted on the list page
        [5] 0 -> substring mode; anything else -> regex mode
        [6] if truthy, passed to ``self.request_again`` with the value
        [7] if truthy, a ``str.format`` template that is filled with the
            value and passed to ``self.deal_with_python``

    Any exception is caught, printed, and shown in ``self.test_text``;
    nothing is returned.
    """
    try:
        url = self.test_url_var.get().strip()
        items = self.content_tree.get_children('')
        content = self.get_html(url)
        content_dict = {}
        self.test_text.delete(1.0, END)

        for item in items:
            value = self.content_tree.item(item).get('values')

            # Rows not flagged for the content page are only reported.
            if value[4] != 0:
                print('%s在列表页提取' % value[0])
                continue

            print(value)

            if value[5] == 0:
                # Substring mode.
                extracted = self.deal_with_sustring(content, value[1], value[2])
            else:
                # Regex mode: keep the first match, or '' when none.
                matches = re.findall(value[3], content, re.I | re.M)
                extracted = matches[0] if matches else ''

            # Optional follow-up request driven by the extracted value.
            if value[6]:
                extracted = self.request_again(url, extracted, value[6])

            # Optional Python post-processing. The regex branch used to
            # compute this and then discard it; both modes now keep it.
            if value[7]:
                extracted = self.deal_with_python(value[7].format(extracted))

            # NOTE(review): the original substring branch contained a
            # truncated statement (`return_value = self.c`) at this point.
            # It would have overwritten the extracted value, so it was
            # dropped; restore the intended call if one existed.

            content_dict[value[0]] = extracted
            # str.format instead of '+' so non-str values (tuple matches,
            # numeric cells) do not raise TypeError.
            # NOTE(review): the trailing separator is a single space as in
            # the original; a newline may have been intended - confirm.
            self.test_text.insert(END, '{}: {} '.format(value[0], extracted))

        print(content_dict)
    except Exception as e:
        # UI boundary: report the failure instead of crashing the handler.
        print(e)
        self.test_text.insert(END, '错误信息:' + str(e))
有需要源码的可以评论哦~