一个简单的 Python 爬虫,爬取知乎。主要实现:爬取一个收藏夹里所有问题答案下的图片。文字信息暂未收录,可自行实现,比图片更简单。具体代码里有详细注释,请自行阅读。

项目源码:

# -*- coding: utf-8 -*-
"""Zhihu collection image spider.

Crawls every answer under each question of one Zhihu collection
(favorites folder) and downloads the images found in the answers.
Answer text is deliberately not collected (see ``_getTextFromAnswer``).

Requires BeautifulSoup4 and the local ``spider.py`` module providing
``SpiderHTML`` (with ``getUrl``, ``saveText`` and ``saveImg`` helpers —
their exact contract is not visible here; assumed from call sites).
Original target environment: Python 3.4 on Windows 7.
"""
from spider import SpiderHTML
from multiprocessing import Pool  # NOTE(review): unused in this file
import sys, urllib, http, os, random, re, time

__author__ = 'waiting'

# URL of the collection to crawl; the ?page= parameter is appended in code.
url = 'https://www.zhihu.com/collection/30822111'
# Local folder images are stored under (created on demand by the save helpers).
store_path = 'E:\\zhihu\\收藏夹\\会员才知道的世界'


class zhihuCollectionSpider(SpiderHTML):
    """Spider over one Zhihu collection, pages pageStart..pageEnd inclusive."""

    def __init__(self, pageStart, pageEnd, url):
        self._url = url
        self._pageStart = int(pageStart)
        self._pageEnd = int(pageEnd) + 1   # +1 so range() includes pageEnd
        self.downLimit = 0                 # answers with fewer upvotes are skipped

    def start(self):
        """Walk every page of the collection and process each question on it."""
        for page in range(self._pageStart, self._pageEnd):
            url = self._url + '?page=' + str(page)
            content = self.getUrl(url)
            questionList = content.find_all('div', class_='zm-item')
            for question in questionList:       # each question on this page
                Qtitle = question.find('h3', class_='zm-item-title')
                if Qtitle is None:              # question was censored/removed
                    continue
                Qurl = 'https://www.zhihu.com' + Qtitle.a['href']
                # Replace characters Windows forbids in file/directory names.
                Qtitle = re.sub(r'[\\/:*?"<>]', '#', Qtitle.a.string)
                try:
                    print('-----正在获取问题:' + Qtitle + '-----')
                except UnicodeEncodeError:
                    print(r'---问题含有特殊字符无法显示---')
                try:
                    Qcontent = self.getUrl(Qurl)
                except Exception:
                    # BUG FIX: the original fell through here, leaving
                    # Qcontent undefined (NameError on the next line).
                    # Skip the question instead.
                    print('!!!!获取出错!!!!!')
                    continue
                answerList = Qcontent.find_all(
                    'div', class_='zm-item-answer zm-item-expanded')
                self._processAnswer(answerList, Qtitle)
                time.sleep(5)                   # throttle: be polite to the server

    def _processAnswer(self, answerList, Qtitle):
        """Save author info for every qualifying answer, then fetch its images."""
        for j, answer in enumerate(answerList, start=1):
            # Upvote count; Zhihu abbreviates thousands with a 'K' suffix.
            upvoted = int(
                answer.find('span', class_='count').string.replace('K', '000'))
            if upvoted < self.downLimit:
                continue
            # Author block; anonymous authors have no author-link anchor.
            authorInfo = answer.find('div', class_='zm-item-answer-author-info')
            author = {'introduction': '', 'link': ''}
            try:
                author['name'] = authorInfo.find('a', class_='author-link').string
                author['introduction'] = str(
                    authorInfo.find('span', class_='bio')['title'])
                author['link'] = authorInfo.find('a', class_='author-link')['href']
            except AttributeError:
                author['name'] = '匿名用户' + str(j)   # anonymous: synthesize a name
            except TypeError:
                pass                                   # empty bio; keep defaults
            file_name = os.path.join(store_path, Qtitle, 'info',
                                     author['name'] + '_info.txt')
            if os.path.exists(file_name):              # already crawled earlier
                continue
            self.saveText(file_name, '{introduction}\r\n{link}'.format(**author))
            print('正在获取用户`{name}`的答案'.format(**author))
            answerContent = answer.find('div', class_='zm-editable-content clearfix')
            if answerContent is None:                  # reported answers have no body
                continue
            imgs = answerContent.find_all('img')
            if imgs:                                   # only recurse when images exist
                self._getImgFromAnswer(imgs, Qtitle, **author)

    def _getImgFromAnswer(self, imgs, Qtitle, **author):
        """Download every full-size image of one answer into store_path/Qtitle."""
        i = 0
        for img in imgs:
            if 'inline-image' in img['class']:  # skip Zhihu's inline thumbnails
                continue
            i += 1
            imgUrl = img['src']
            extension = os.path.splitext(imgUrl)[1]
            path_name = os.path.join(store_path, Qtitle,
                                     author['name'] + '_' + str(i) + extension)
            try:
                # Best effort: swallow any image error so the crawl continues.
                self.saveImg(imgUrl, path_name)
            except Exception:
                pass

    def _getTextFromAnswer(self):
        """Placeholder: collecting answer text is intentionally not implemented."""
        pass


# Command-line usage, e.g. ``zhihu.py 1 5`` crawls pages 1 through 5.
if __name__ == '__main__':
    page, pageEnd = 1, 1
    paramsNum = len(sys.argv)
    if paramsNum >= 3:
        page, pageEnd = sys.argv[1], sys.argv[2]
    elif paramsNum == 2:
        page = sys.argv[1]
        pageEnd = page
    spider = zhihuCollectionSpider(page, pageEnd, url)
    spider.start()

很多初学者,对Python的概念都是模糊不清的,Python能做什么,学的时候,该按照什么线路去学习,学完往哪方面发展,想深入了解,详情可以点击有道云笔记链接了解:http://note.youdao.com/noteshare?id=e4fa02e7b56d7909a27674cdb3da08aa