TXT
from pyquery import PyQuery
import re
import json
import requests
def get_html(url):
    """Fetch *url* and return the response body as text, or None on failure.

    A browser User-Agent is sent because maoyan.com rejects the default
    requests UA.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'}
    try:
        # timeout so a stalled connection cannot hang the crawl forever
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
        return r.text
    except requests.RequestException as e:
        # Was a bare `except:` printing a misleading message; catch only
        # requests failures and report what actually went wrong.
        print('request failed:', e)
        return None
def parse_time(text):
    """Extract a release date like '1994-09-10' (or a bare year) from *text*.

    Returns the matched 'YYYY' or 'YYYY-MM-DD' substring, or '' when no
    date is present (the original raised AttributeError on no match).
    """
    # Raw string + escaped \d: the original pattern 'd{4}(-d{2}-d{2})*'
    # matched the literal letter 'd', so it never matched a real date.
    m = re.search(r'\d{4}(-\d{2}-\d{2})?', text)
    return m.group() if m else ''
def parse_html(html, f):
    """Parse one board page and append one multi-line record per movie to *f*.

    The original built the joined record in a variable named ``str``
    (shadowing the builtin) and never wrote it out; the string literal was
    also corrupted. Each record is now written as rank/name/actors/date/score
    on separate lines, followed by a blank line.
    """
    doc = PyQuery(html)
    dd_nodes = doc('dl.board-wrapper')
    # Parallel streams of cells, one per movie; zip pairs them up below.
    ranks = dd_nodes('.board-index').items()
    names = dd_nodes('.name').items()
    actors = dd_nodes('.star').items()
    times = dd_nodes('.releasetime').items()
    integers = dd_nodes('.integer').items()
    fractions = dd_nodes('.fraction').items()
    for rank, name, actor, ts, integer, fraction in zip(ranks, names, actors, times, integers, fractions):
        record = '\n'.join([
            rank.text(),
            name.text(),
            actor.text().replace('主演:', ''),  # strip the "starring:" label
            parse_time(ts.text()),
            integer.text() + fraction.text(),  # score = integer part + decimal part
        ])
        f.write(record + '\n\n')
if __name__ == '__main__':
    url = 'http://maoyan.com/board/4'
    # encoding='utf-8' is explicit: movie names/actors are Chinese and the
    # platform-default encoding (e.g. gbk on some Windows setups) can fail.
    with open('movie.txt', 'w', encoding='utf-8') as f:
        # The TOP100 board pages 10 movies at a time via the ?offset= query.
        for i in range(10):
            path = url + '?offset=' + str(i * 10)
            print(path)
            html = get_html(path)
            if html:
                parse_html(html, f)
JSON
json.loads(s):把 JSON 字符串解析为 Python 对象
json.dumps(obj, indent=2, ensure_ascii=False):把 Python 对象序列化为 JSON 字符串
indent=2 设置缩进格式,2 代表缩进字符数
ensure_ascii=False 保留非 ASCII 字符(如中文),避免输出乱码
from pyquery import PyQuery
import re
import json
import requests
def get_html(url):
    """Fetch *url* and return the response body as text, or None on failure.

    A browser User-Agent is sent because maoyan.com rejects the default
    requests UA.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'}
    try:
        # timeout so a stalled connection cannot hang the crawl forever
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
        return r.text
    except requests.RequestException as e:
        # Was a bare `except:` printing a misleading message; catch only
        # requests failures and report what actually went wrong.
        print('request failed:', e)
        return None
def parse_time(text):
    """Extract a release date like '1994-09-10' (or a bare year) from *text*.

    Returns the matched 'YYYY' or 'YYYY-MM-DD' substring, or '' when no
    date is present (the original raised AttributeError on no match).
    """
    # Raw string + escaped \d: the original pattern 'd{4}(-d{2}-d{2})*'
    # matched the literal letter 'd', so it never matched a real date.
    m = re.search(r'\d{4}(-\d{2}-\d{2})?', text)
    return m.group() if m else ''
def parse_html(html, f):
    """Parse one board page and write one pretty-printed JSON object per
    movie to the already-open text file *f*.
    """
    doc = PyQuery(html)
    dd_nodes = doc('dl.board-wrapper')
    # Parallel streams of cells, one per movie; zip pairs them up below.
    ranks = dd_nodes('.board-index').items()
    names = dd_nodes('.name').items()
    actors = dd_nodes('.star').items()
    times = dd_nodes('.releasetime').items()
    integers = dd_nodes('.integer').items()
    fractions = dd_nodes('.fraction').items()
    for rank, name, actor, ts, integer, fraction in zip(ranks, names, actors, times, integers, fractions):
        data = {
            'rank': rank.text(),
            'name': name.text(),
            'actor': actor.text().replace('主演:', ''),  # strip "starring:" label
            'time': parse_time(ts.text()),
            'score': integer.text() + fraction.text()  # integer part + decimal part
        }
        # ensure_ascii=False keeps Chinese readable; the trailing newline
        # separates records (the original ran them together: `}{`).
        f.write(json.dumps(data, indent=2, ensure_ascii=False) + '\n')
if __name__ == '__main__':
    url = 'http://maoyan.com/board/4'
    # encoding='utf-8' is explicit: movie names/actors are Chinese and the
    # platform-default encoding (e.g. gbk on some Windows setups) can fail.
    with open('movie_json.txt', 'w', encoding='utf-8') as f:
        # The TOP100 board pages 10 movies at a time via the ?offset= query.
        for i in range(10):
            path = url + '?offset=' + str(i * 10)
            print(path)
            html = get_html(path)
            if html:
                parse_html(html, f)
CSV
import csv

# newline='' is required by the csv module so it controls line endings
# itself (otherwise every row is followed by a blank line on Windows);
# utf-8 makes the file encoding explicit and portable.
with open('data.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')  # ',' is already the default delimiter
    writer.writerow(['id', 'name', 'age'])  # header row
    # writerows() writes all data rows in one call.
    writer.writerows([
        ['1001', 'mike', 20],
        ['1002', 'bob', 22],
        ['1003', 'jordan', 21],
    ])
CSV存入字典类型数据
import csv

# newline='' avoids the extra blank row per record on Windows; utf-8
# makes the encoding explicit.
with open('data.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['id', 'name', 'age']
    # fieldnames fixes both the column order and the header titles.
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerow({'id': 1001, 'name': 'mike', 'age': 20})
    writer.writerow({'id': 1002, 'name': 'bob', 'age': 22})
    writer.writerow({'id': 1003, 'name': 'char', 'age': 24})
# Appending rows to an existing CSV file
import csv

# mode 'a' appends instead of truncating; newline='' is still required,
# and utf-8 is needed for the Chinese names below.
with open('data.csv', 'a', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['id', 'name', 'age']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    # NOTE(review): writeheader() in append mode writes the header again
    # if the file already has one — skip it when appending to a non-empty file.
    writer.writeheader()
    writer.writerow({'id': 1001, 'name': '张', 'age': 20})
    writer.writerow({'id': 1002, 'name': '李', 'age': 22})
    writer.writerow({'id': 1003, 'name': '黄', 'age': 24})
# Reading a CSV file row by row
import csv

# newline='' lets the csv module handle embedded line endings correctly
# (per the csv docs); encoding must match what the file was written with.
with open('data.csv', 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:  # each row is a list of column strings
        print(row)
from pyquery import PyQuery
import csv
import re
import requests
def get_html(url):
    """Fetch *url* and return the response body as text, or None on failure.

    A browser User-Agent is sent because maoyan.com rejects the default
    requests UA.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'}
    try:
        # timeout so a stalled connection cannot hang the crawl forever
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
        return r.text
    except requests.RequestException as e:
        # Was a bare `except:` printing a misleading message; catch only
        # requests failures and report what actually went wrong.
        print('request failed:', e)
        return None
def parse_time(text):
    """Extract a release date like '1994-09-10' (or a bare year) from *text*.

    Returns the matched 'YYYY' or 'YYYY-MM-DD' substring, or '' when no
    date is present (the original raised AttributeError on no match).
    """
    # Raw string + escaped \d: the original pattern 'd{4}(-d{2}-d{2})*'
    # matched the literal letter 'd', so it never matched a real date.
    m = re.search(r'\d{4}(-\d{2}-\d{2})?', text)
    return m.group() if m else ''
def parse_html(html):
    """Parse one board page and append its movies as rows of movie.csv.

    The header row is written once by the __main__ block; this function
    only appends data rows.
    """
    doc = PyQuery(html)
    dd_nodes = doc('dl.board-wrapper')
    # Parallel streams of cells, one per movie; zip pairs them up below.
    ranks = dd_nodes('.board-index').items()
    names = dd_nodes('.name').items()
    actors = dd_nodes('.star').items()
    times = dd_nodes('.releasetime').items()
    integers = dd_nodes('.integer').items()
    fractions = dd_nodes('.fraction').items()
    # newline='' is required by the csv module (avoids blank rows on Windows).
    with open('movie.csv', 'a', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['rank', 'name', 'actor', 'time', 'score']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        for rank, name, actor, ts, integer, fraction in zip(ranks, names, actors, times, integers, fractions):
            data = {
                'rank': rank.text(),
                'name': name.text(),
                'actor': actor.text().replace('主演:', ''),  # strip "starring:" label
                'time': parse_time(ts.text()),
                'score': integer.text() + fraction.text()  # integer part + decimal part
            }
            writer.writerow(data)
if __name__ == '__main__':
    url = 'http://maoyan.com/board/4'
    # Create the file and write the header once; parse_html() re-opens it
    # in append mode for the data rows. newline='' is required by csv.
    with open('movie.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['rank', 'name', 'actor', 'time', 'score']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
    # The TOP100 board pages 10 movies at a time via the ?offset= query.
    for i in range(10):
        path = url + '?offset=' + str(i * 10)
        print(path)
        html = get_html(path)
        if html:
            parse_html(html)