How to Scrape JD.com Review Data with Python

The script below searches JD.com for a keyword (a Huawei P20 query in the example URL), parses the search result list with BeautifulSoup, then calls the review-summary endpoint for each of the first three items to read the total comment count and positive-review rate, and finally writes one row per item into phone.csv.
import re
import time
import csv
import requests
from bs4 import BeautifulSoup


def write_a_row_in_csv(data, csv_doc):
    """Append one item's information as a row in the csv document."""
    with open(csv_doc, 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(data)


# add headers, download the search page, check the status code
url = 'https://search.jd.com/Search?keyword=%E5%8D%8E%E4%B8%BAp20&enc=utf-8&suggest=1.def.0.V13&wq=%E5%8D%8E%E4%B8%BA&pvid=f47b5d05bba84d9dbfabf983575a6875'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"
}
response = requests.get(url, headers=headers)
print(response.status_code)

# save the search page as an html document
with open('html.html', 'w', encoding='utf8') as f:
    f.write(response.text)

# create the csv document and write the header row
with open('phone.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    fields = ('id', '名称', '价格', '评价人数', '好评率')
    writer.writerow(fields)

# find elements such as name, item link, price, comment url, good rate, comment count
soup_all = BeautifulSoup(response.content, 'lxml')
sp_all_items = soup_all.find_all('li', attrs={'class': 'gl-item'})
for soup in sp_all_items[:3]:
    print('-' * 50)
    name = soup.find('div', attrs={'class': 'p-name p-name-type-2'}).find('em').text
    print('name: ', name)
    item = soup.find('div', attrs={'class': 'p-name p-name-type-2'}).find('a')
    print('item: ', item['href'], re.search(r'(\d+)', item['href']).group())
    price = soup.find_all('div', attrs={'class': 'p-price'})
    print('price:', price[0].i.string)
    comment = soup.find_all('div', attrs={'class': 'p-commit'})
    print('comment url:', comment[0].find('a').attrs['href'])
    time.sleep(0.2)

    # the comment endpoint needs a referer header
    item_id = re.search(r'(\d+)', item['href']).group()
    url = f'https://sclub.jd.com/comment/productPageComments.action?productId={item_id}&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1'
    headers = {
        "referer": f"https://item.jd.com/{item_id}.html",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"
    }
    response = requests.get(url, headers=headers)
    with open('html.json', 'w', encoding='utf8') as f:
        f.write(response.text)
    data = response.json()
    comment_count = data['productCommentSummary']['commentCount']
    print('评价人数:', comment_count)
    good_rate = data['productCommentSummary']['goodRate']
    print('好评率:', good_rate)

    # record the data as a row in the CSV sheet
    write_a_row_in_csv(('id' + item_id, name, price[0].i.string, comment_count, good_rate), 'phone.csv')
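The same productPageComments endpoint also appears to return the individual reviews, not only the summary counts. Below is a minimal sketch of pulling the review texts themselves, assuming the JSON payload carries a comments list whose entries have content, score, and creationTime keys (verify this against the html.json dump saved above); fetch_review_texts and the item id in the usage comment are illustrative names, not part of the original script.

import requests

def fetch_review_texts(item_id, page=0, page_size=10):
    """Fetch one page of review texts for a JD item.

    Assumes the productPageComments response is JSON with a 'comments'
    list whose entries hold 'content', 'score' and 'creationTime' keys;
    adjust the keys if the actual payload differs.
    """
    url = (
        'https://sclub.jd.com/comment/productPageComments.action'
        f'?productId={item_id}&score=0&sortType=5'
        f'&page={page}&pageSize={page_size}&isShadowSku=0&fold=1'
    )
    headers = {
        "referer": f"https://item.jd.com/{item_id}.html",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    }
    data = requests.get(url, headers=headers).json()
    # keep only the fields of interest: time posted, star score, review text
    return [
        (c.get('creationTime'), c.get('score'), c.get('content'))
        for c in data.get('comments', [])
    ]

# example usage with a placeholder item id
# for created, score, text in fetch_review_texts('100002795959'):
#     print(created, score, text)

The page and page_size parameters mirror the query string used in the main script, so the same function can be called in a loop to walk through additional pages of reviews.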