为爬取某网站商品信息而写的 Python 代码
代码如下:
"""Crawl product listings from http://www.kgtmall.com.cn/ into a CSV file.

Usage:
    1. Install Python 3 together with the ``requests`` and ``pyquery``
       packages.
    2. Run this script directly.
    3. When the run finishes, ``result.csv`` (written next to this script)
       holds the scraped product information.

Note:
    Do not open ``result.csv`` while the script is running, otherwise rows
    cannot be written to it. Wait until the run has finished before viewing
    the file.
"""

import csv
import datetime
import os
import re
from typing import Iterator, Pattern, Tuple

import requests
from pyquery import PyQuery

HOME_URL = 'http://www.kgtmall.com.cn/'

# Listing URL for one category page, ordered by sales volume (order=10).
SEARCH_URL_TEMPLATE = (
    'http://www.kgtmall.com.cn/mall/search.php'
    '?kw=&list=0&catid={}&order=10&minprice=&maxprice=&page={}'
)

# NOTE(review): the CSS selectors below were reconstructed from a
# whitespace-mangled copy of this script (e.g. '.menu_titlea'); the descendant
# combinators are inferred -- confirm them against the live page markup.
TOP_MENU_SELECTOR = '.menu_title a'
CATEGORY_SELECTOR = '.selector_category li'
PAGER_SELECTOR = '.pagination cite'
PRODUCT_LINK_SELECTOR = '.list_img a'
PRODUCT_IMAGE_SELECTOR = 'a img'
BAR_CODE_SELECTOR = '.bar_code dl dd p'

# Raw strings so that regex escapes are never interpreted by Python itself.
CATID_RE = re.compile(r'[?&]catid=(\d+)')
PAGE_COUNT_RE = re.compile(r'共.*?条/(.*)页')
# Accept both the ASCII and the full-width colon after the field label.
# NOTE(review): which colon the site actually uses could not be confirmed.
SPEC_RE = re.compile(r'规格[::](.*)')
ORIGIN_RE = re.compile(r'产地[::](.*)')

CSV_HEADERS = ['一级分类', '二级分类', '三级分类', '商品名称', '条码',
               '产地', '规格', '价格', '商品链接']


def _result_csv_path() -> str:
    """Return the absolute path of ``result.csv`` next to this script.

    ``os.path.abspath`` is required because ``__file__`` can be a bare
    relative file name, in which case ``dirname`` is empty and naive string
    concatenation would point at ``/result.csv`` in the filesystem root.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(script_dir, 'result.csv')


def _first_group(pattern: Pattern[str], text: str, default: str) -> str:
    """Return the first capture group of *pattern* in *text*, or *default*."""
    match = pattern.search(text)
    return match.group(1) if match else default


def get_html_text(url: str, timeout: float = 30) -> str:
    """Download *url* and return the response body as text.

    :param url: address of the page to fetch
    :param timeout: seconds to wait for the server before giving up, so a
        stalled connection cannot hang the crawl forever
    :return: decoded response body
    """
    response = requests.get(url, timeout=timeout)
    return response.text


def get_one_level_class(home_url: str) -> Iterator[Tuple[str, str]]:
    """Yield ``(url, title)`` for every first-level category on the home page.

    Example rows::

        母婴用品  http://www.kgtmall.com.cn/mall/list.php?catid=4
        生活家居  http://www.kgtmall.com.cn/mall/list.php?catid=5
    """
    doc = PyQuery(get_html_text(home_url))
    for node in doc(TOP_MENU_SELECTOR):
        link = PyQuery(node)('a')
        one_level_url = link.attr('href')
        if not one_level_url:
            # A menu entry without a link cannot be crawled any further.
            continue
        yield one_level_url, link.text()


def get_two_level_class(home_url: str) -> Iterator[Tuple[str, str, str]]:
    """Yield ``(level1_title, level2_title, level2_url)`` for each subcategory.

    Example rows::

        母婴用品  营养辅食  http://www.kgtmall.com.cn/mall/search.php?catid=539
        母婴用品  妈妈专区  http://www.kgtmall.com.cn/mall/search.php?catid=544
        母婴用品  婴儿保健  http://www.kgtmall.com.cn/mall/search.php?catid=887
    """
    for one_level_url, one_level_title in get_one_level_class(home_url):
        doc = PyQuery(url=one_level_url)
        for node in doc(CATEGORY_SELECTOR):
            link = PyQuery(node)('a')
            two_level_url = link.attr('href')
            if not two_level_url:
                continue
            yield one_level_title, link.text(), two_level_url


def get_pages(url: str) -> int:
    """Return the number of result pages of the listing at *url*.

    The count is parsed from the pager text (``共 N 条/M 页``); when the pager
    is missing or cannot be parsed, a single page is assumed.
    """
    doc = PyQuery(url=url)
    pager_text = doc(PAGER_SELECTOR).text()
    print('原pages:', pager_text)
    try:
        pages = int(PAGE_COUNT_RE.findall(pager_text)[0])
    except (IndexError, ValueError) as e:
        # IndexError: no pager on the page; ValueError: non-numeric count.
        print(e)
        pages = 1
    print('页码:', pages)
    return pages


def get_three_level_class(
        home_url: str) -> Iterator[Tuple[str, str, str, str]]:
    """Yield one listing-page URL per page of every third-level category.

    Each item is ``(level1_title, level2_title, level3_title, page_url)``
    where ``page_url`` is the sales-ordered listing for one page.

    Example categories::

        母婴用品  营养辅食  DHA           .../mall/search.php?catid=548
        母婴用品  营养辅食  益生菌/初乳     .../mall/search.php?catid=549
        母婴用品  营养辅食  清火/开胃/驱虫  .../mall/search.php?catid=550
    """
    for one_level_title, two_level_title, two_level_url in \
            get_two_level_class(home_url):
        doc = PyQuery(url=two_level_url)
        for node in doc(CATEGORY_SELECTOR):
            link = PyQuery(node)('a')
            three_level_title = link.text()
            three_level_url = link.attr('href')
            match = CATID_RE.search(three_level_url or '')
            if match is None:
                # Skip entries without a usable category id instead of
                # aborting the whole crawl on one malformed link.
                print('跳过(无法解析catid):', three_level_title,
                      three_level_url)
                continue
            catid = match.group(1)
            pages = get_pages(three_level_url)
            for index in range(1, pages + 1):
                three_level_url_by_xiaoliang = SEARCH_URL_TEMPLATE.format(
                    catid, index)
                yield (one_level_title, two_level_title, three_level_title,
                       three_level_url_by_xiaoliang)


def shop_title_and_url(
        home_url: str) -> Iterator[Tuple[str, str, str, str, str]]:
    """Yield the title and detail-page URL of every listed product.

    Each item is ``(level1_title, level2_title, level3_title, product_title,
    product_url)``.

    Example row::

        母婴用品  营养辅食  DHA  澳洲直邮 澳大利亚RIFOLD 儿童DHA 90粒
        http://www.kgtmall.com.cn/mall/show.php?itemid=28089
    """
    for (one_level_title, two_level_title, three_level_title,
         three_level_url_by_xiaoliang) in get_three_level_class(home_url):
        doc = PyQuery(url=three_level_url_by_xiaoliang)
        for node in doc(PRODUCT_LINK_SELECTOR):
            item = PyQuery(node)
            shop_url = item('a').attr('href')
            if not shop_url:
                # Without a detail-page link there is nothing to scrape.
                continue
            shop_title = item(PRODUCT_IMAGE_SELECTOR).attr('alt')
            yield (one_level_title, two_level_title, three_level_title,
                   shop_title, shop_url)


def get_shop_info(home_url: str, count: int) -> None:
    """Scrape every product's details and append them to ``result.csv``.

    :param home_url: home page of the shop; the crawl starts here
    :param count: sequence number printed for the first product; it is
        incremented by one for each following product
    """
    csv_file = _result_csv_path()
    # newline='' avoids blank lines between CSV rows; the explicit encoding
    # avoids charset errors when writing the Chinese text.
    with open(csv_file, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        products = shop_title_and_url(home_url)
        for count, product in enumerate(products, start=count):
            (one_level_title, two_level_title, three_level_title,
             shop_title, shop_url) = product
            print('--排错:' + one_level_title, two_level_title,
                  three_level_title, shop_title, shop_url)

            doc = PyQuery(url=shop_url)
            price = doc('.price').text()
            bar_code = doc(BAR_CODE_SELECTOR).text()  # product bar code
            detail_text = doc('#content').text()
            guige = _first_group(SPEC_RE, detail_text, '没有规格')
            chandi = _first_group(ORIGIN_RE, detail_text, '没有产地')

            print(count, one_level_title, two_level_title, three_level_title,
                  shop_title, bar_code, chandi, guige, price, shop_url)
            writer.writerow([one_level_title, two_level_title,
                             three_level_title, shop_title, bar_code, chandi,
                             guige, price, shop_url])
            # Flush after each row so a crash mid-crawl keeps what was
            # already scraped.
            f.flush()


def main() -> None:
    """Write the CSV header, run the full crawl and report the elapsed time."""
    start_time = datetime.datetime.now()

    # (Re)create the result file so every run starts from the header row.
    with open(_result_csv_path(), 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow(CSV_HEADERS)

    get_shop_info(HOME_URL, 1)

    timediff = datetime.datetime.now() - start_time
    # total_seconds() also counts whole days, unlike the .seconds attribute.
    print('总共用时{}秒\n'.format(int(timediff.total_seconds())))
    print('全部商品已经按需求完成!!!')


if __name__ == '__main__':
    main()
运行后,会在脚本所在目录下生成一个 result.csv 文件,内容如下:
声明:本站所有文章资源内容,如无特殊说明或标注,均为采集网络资源。如若本站内容侵犯了原著者的合法权益,可联系本站删除。