(第一篇)爬虫技术专栏之requests模块与BeautifulSoup模块
自从大一开始就想在博客专门建立一个分类,深度总结一下爬虫技术相关的东西,但是一直拖到现在。大二马上要结束了,准备从暑假开始认真总结,每一篇文章都争取带一个小案例。给学弟们作参考用~
0x01 requests模块
0x001 安装
requests是一个为Python构建的优雅而简单的HTTP库。目前它使用Apache2 Licensed许可证,并在Python的一些基本库之上进行了高度封装。中文文档:http://docs.python-requests.org/zh_CN/latest/
pip install requests
0x002 常用方法
requests.get(url, params=None, **kwargs)
# 发送一个get请求到服务器端
# url接收一个URL地址
# params接收一个字典、元组列表或字节,作为URL的查询字符串发送
# 返回一个Response响应对象
requests.options(url, **kwargs)
# 发送一个options请求到服务器端# url接收一个URL地址
requests.head(url, **kwargs)
# 发送一个head请求到服务器端# url接收一个URL地址
requests.post(url, data=None, json=None, **kwargs)
# 发送一个post请求到服务器端# url接收一个URL地址# data接收一个字典、字节或者是一个文件对象# json接收一个json数据
requests.put(url, data=None, **kwargs)
# 发送一个put请求到服务器端# url接收一个URL地址# data接收一个字典、字节或者是一个文件对象
requests.patch(url, data=None, **kwargs)
# 发送一个patch请求到服务器端# url接收一个URL地址# data接收一个字典、字节或者是文件对象
requests.delete(url, **kwargs)
# 发送一个delete请求到服务器端# url接收一个URL地址
requests.request(method, url, **kwargs)
# 发送一个请求
# method指定请求的方法
# url接收一个URL地址
# params接收一个字典、元组列表或字节,作为URL的查询字符串发送
# data接收一个使用元组构成的列表[(key, value)],或者是字典、字节或文件对象,作为请求体发送
# json接收一个可以序列化为JSON的Python对象,作为请求体发送
# headers接收一个字典,用于构成请求头
# cookies接收一个字典或CookieJar对象
# files接收一个{'name': 文件对象}形式的字典,用于multipart编码上传
# auth接收一个元组,用来身份认证
# timeout接收一个浮点数或者是(连接超时, 读取超时)元组
# allow_redirects接收一个布尔值,默认是True,是否开启重定向
# proxies接收一个字典,将协议映射到代理的URL
# verify接收一个布尔值(是否验证服务器的TLS证书)或CA证书包的路径,默认是True
# stream是否使用数据流的方式传输:为False时响应内容会被立即下载
# cert使用客户端证书文件:如果是pem文件,则传入路径字符串'xxx.pem';如果是crt文件和key文件,则传入元组('xxx.crt', 'xxx.key')
0x003 requests.api源码
# -*- coding: utf-8 -*-

"""
requests.api

This module implements the Requests API.

:copyright: (c) 2012 by Kenneth Reitz.
:license: Apache2, see LICENSE for more details.
"""

from . import sessions


def request(method, url, **kwargs):
    """Constructs and sends a :class:`Request <Request>`.

    :param method: method for the new :class:`Request` object.
    :param url: URL for the new :class:`Request` object.
    :param params: (optional) Dictionary, list of tuples or bytes to send
        in the query string for the :class:`Request`.
    :param data: (optional) Dictionary, list of tuples, bytes, or file-like
        object to send in the body of the :class:`Request`.
    :param json: (optional) A JSON serializable Python object to send in the body of the :class:`Request`.
    :param headers: (optional) Dictionary of HTTP Headers to send with the :class:`Request`.
    :param cookies: (optional) Dict or CookieJar object to send with the :class:`Request`.
    :param files: (optional) Dictionary of ``'name': file-like-objects`` (or ``{'name': file-tuple}``) for multipart encoding upload.
        ``file-tuple`` can be a 2-tuple ``('filename', fileobj)``, 3-tuple ``('filename', fileobj, 'content_type')``
        or a 4-tuple ``('filename', fileobj, 'content_type', custom_headers)``, where ``'content-type'`` is a string
        defining the content type of the given file and ``custom_headers`` a dict-like object containing additional headers
        to add for the file.
    :param auth: (optional) Auth tuple to enable Basic/Digest/Custom HTTP Auth.
    :param timeout: (optional) How many seconds to wait for the server to send data
        before giving up, as a float, or a :ref:`(connect timeout, read
        timeout) <timeouts>` tuple.
    :type timeout: float or tuple
    :param allow_redirects: (optional) Boolean. Enable/disable GET/OPTIONS/POST/PUT/PATCH/DELETE/HEAD redirection. Defaults to ``True``.
    :type allow_redirects: bool
    :param proxies: (optional) Dictionary mapping protocol to the URL of the proxy.
    :param verify: (optional) Either a boolean, in which case it controls whether we verify
        the server's TLS certificate, or a string, in which case it must be a path
        to a CA bundle to use. Defaults to ``True``.
    :param stream: (optional) if ``False``, the response content will be immediately downloaded.
    :param cert: (optional) if String, path to ssl client cert file (.pem). If Tuple, ('cert', 'key') pair.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response

    Usage::

      >>> import requests
      >>> req = requests.request('GET', 'https://httpbin.org/get')
      >>> req
      <Response [200]>
    """

    # By using the 'with' statement we are sure the session is closed, thus we
    # avoid leaving sockets open which can trigger a ResourceWarning in some
    # cases, and look like a memory leak in others.
    with sessions.Session() as session:
        return session.request(method=method, url=url, **kwargs)


def get(url, params=None, **kwargs):
    r"""Sends a GET request.

    :param url: URL for the new :class:`Request` object.
    :param params: (optional) Dictionary, list of tuples or bytes to send
        in the query string for the :class:`Request`.
    :param \*\*kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """

    # Redirects are followed unless the caller explicitly passes allow_redirects.
    kwargs.setdefault('allow_redirects', True)
    return request('get', url, params=params, **kwargs)


def options(url, **kwargs):
    r"""Sends an OPTIONS request.

    :param url: URL for the new :class:`Request` object.
    :param \*\*kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """

    # Redirects are followed unless the caller explicitly passes allow_redirects.
    kwargs.setdefault('allow_redirects', True)
    return request('options', url, **kwargs)


def head(url, **kwargs):
    r"""Sends a HEAD request.

    :param url: URL for the new :class:`Request` object.
    :param \*\*kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """

    # Unlike get() and options(), HEAD defaults to NOT following redirects;
    # the caller can still opt in with allow_redirects=True.
    kwargs.setdefault('allow_redirects', False)
    return request('head', url, **kwargs)


def post(url, data=None, json=None, **kwargs):
    r"""Sends a POST request.

    :param url: URL for the new :class:`Request` object.
    :param data: (optional) Dictionary, list of tuples, bytes, or file-like
        object to send in the body of the :class:`Request`.
    :param json: (optional) json data to send in the body of the :class:`Request`.
    :param \*\*kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """

    return request('post', url, data=data, json=json, **kwargs)


def put(url, data=None, **kwargs):
    r"""Sends a PUT request.

    :param url: URL for the new :class:`Request` object.
    :param data: (optional) Dictionary, list of tuples, bytes, or file-like
        object to send in the body of the :class:`Request`.
    :param json: (optional) json data to send in the body of the :class:`Request`.
        Not a named parameter here; it is accepted through ``**kwargs``.
    :param \*\*kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """

    return request('put', url, data=data, **kwargs)


def patch(url, data=None, **kwargs):
    r"""Sends a PATCH request.

    :param url: URL for the new :class:`Request` object.
    :param data: (optional) Dictionary, list of tuples, bytes, or file-like
        object to send in the body of the :class:`Request`.
    :param json: (optional) json data to send in the body of the :class:`Request`.
        Not a named parameter here; it is accepted through ``**kwargs``.
    :param \*\*kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """

    return request('patch', url, data=data, **kwargs)


def delete(url, **kwargs):
    r"""Sends a DELETE request.

    :param url: URL for the new :class:`Request` object.
    :param \*\*kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """

    return request('delete', url, **kwargs)
0x02 BeautifulSoup模块
0x001 安装
Debian或Ubuntu
Beautiful Soup是一个用于从HTML和XML文件中提取数据的Python库。它能够通过你喜欢的转换器实现惯用的文档导航、查找、修改文档的方式。Beautiful Soup会帮你节省数小时甚至数天的工作时间。中文文档:https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/
apt-get install python-bs4  # 包名为小写;Python 3环境请安装python3-bs4
easy_install和pip安装,兼容Python2.x和Python3.x
easy_install beautifulsoup4
pip install beautifulsoup4
源码安装
下载地址:https://www.crummy.com/software/BeautifulSoup/bs4/download/4.0/
首先解压下载的源码压缩包,进入源码目录,执行:python setup.py install
安装解析器lxml和html5lib
apt-get install python-lxml
easy_install lxml
pip install lxml
apt-get install python-html5lib
easy_install html5lib
pip install html5lib
| 解析器 | 使用方法 | 优点 | 缺点 |
| --- | --- | --- | --- |
| Python标准库 | `BeautifulSoup(markup, "html.parser")` | Python的内置标准库,执行速度适中,文档容错能力强 | (Python 2.7.3 或 3.2.2)之前的版本中文档容错能力差 |
| lxml HTML 解析器 | `BeautifulSoup(markup, "lxml")` | 速度快,文档容错能力强 | 需要安装C语言库 |
| lxml XML 解析器 | `BeautifulSoup(markup, ["lxml", "xml"])` 或 `BeautifulSoup(markup, "xml")` | 速度快,唯一支持XML的解析器 | 需要安装C语言库 |
| html5lib | `BeautifulSoup(markup, "html5lib")` | 最好的容错性,以浏览器的方式解析文档,生成HTML5格式的文档 | 速度慢,需要额外安装html5lib这个外部Python依赖 |
从上表可知,推荐使用lxml解析器效率更高,但是xml或html文档的格式不正确的话返回的结果可能不正确。
from bs4 import BeautifulSoup

# Parse a local HTML file.
# - The ``with`` block closes the file handle as soon as parsing is done.
# - The file is opened in binary mode so Beautiful Soup detects the document
#   encoding itself instead of relying on the platform's default encoding.
# - The parser is named explicitly so the result does not depend on which
#   parsers happen to be installed on the machine.
with open("index.html", "rb") as html_file:
    soup = BeautifulSoup(html_file, "lxml")

# Parse HTML markup passed in directly as a string.
soup = BeautifulSoup("<html>data</html>", "lxml")
0x002 常用对象介绍
Beautiful Soup将HTML或XML文件转换为树形结构,每个节点都是Python对象。总共可以分为四种:
标签对象
Tag对象与原生的HTML或XML对象相同。tag = soup.b
Name是Tag的名字。tag.name
Attrs:Tag的属性是一个字典,可以使用tag['class']的方式操作属性,也可以使用tag.attrs来操作属性。
可遍历的字符串NavigableString对象
由于字符串包含在了Tag内,所以Beautiful Soup用 NavigableString 类来包装tag中的字符串。tag.string:它的类型是NavigableString,在Python 2中可以通过unicode()方法将其转换为Unicode字符串,在Python 3中则使用str()。
unicode_string = unicode(tag.string)  # Python 3: unicode_string = str(tag.string)
BS的Tag中包含的字符串不可以被编辑,但是可以通过replace_with()方法被替换成为其他的字符串。
BeautifulSoup对象
该对象表示的是一个文档的全部内容。其soup.name属性的值是:u'[document]'。
注释及特殊字符串Comment对象
Comment 对象是一个特殊类型的 NavigableString 对象
markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
# Name the parser explicitly so the example behaves the same everywhere.
soup = BeautifulSoup(markup, "lxml")
comment = soup.b.string

# print() is needed when this runs as a script; a bare ``type(comment)``
# expression only shows a value in the interactive interpreter.
print(type(comment))
# <class 'bs4.element.Comment'>

# prettify() renders the comment in its special <!-- ... --> form.
print(soup.b.prettify())
# <b>
#  <!--Hey, buddy. Want to buy a used parser?-->
# </b>
Beautiful Soup定义的其他类型可能会出现在XML文档中:CData、ProcessingInstruction、Declaration、Doctype。这些类型与Comment类似,都是NavigableString的子类,只是添加了一些特殊的方法。
0x003 常用属性和方法
| 属性及方法名称 | 释义 |
| --- | --- |
| `soup.head` | 获取`<head></head>` |
| `soup.title` | 获取`<title></title>` |
| `soup.TagName` | 获取`<TagName></TagName>` |
| `soup.find_all('TagName')` | 获取所有TagName的标签 |
| `tag.contents` | 将tag子节点以列表的方式输出 |
| `tag.children` | 返回一个tag子节点的可迭代生成器对象 |
| `tag.descendants` | 属性可以对所有tag的子孙节点进行递归循环 |
| `tag.string` | 获取tag中的字符串内容 |
| `tag.strings` | 循环获取tag中的字符串内容 |
| `tag.stripped_strings` | 功能类似于`tag.strings`,但是具有除去多余空白字符串的功能 |
| `tag.parent` | 获取父节点对象 |
| `tag.parents` | 获取所有父辈节点对象的可迭代生成器对象 |
| `tag.next_sibling` | 获取下一个兄弟节点对象 |
| `tag.previous_sibling` | 获取上一个兄弟节点对象 |
| `tag.next_siblings` | 获取向下的所有兄弟节点的可迭代生成器对象 |
| `tag.previous_siblings` | 获取向上的所有兄弟节点的可迭代生成器对象 |
| `tag.next_element` | 指向解析过程中下一个被解析的对象 |
| `tag.previous_element` | 指向解析过程中上一个被解析的对象 |
| `tag.next_elements` | 指向解析过程中后面(下面)所有被解析对象的集合 |
| `tag.previous_elements` | 指向解析过程中前面(上面)所有被解析对象的集合 |
| `tag.find_all('TagName')` | 查找所有与TagName匹配的节点 |
| `tag.find_all(['TagName1', 'TagName2'])` | 查找所有与列表中任一TagName相匹配的节点 |
| `tag.find_all(True)` | 返回所有可以匹配的tag节点(不包括字符串节点) |
| `tag.find_all(FuncName)` | 接收一个方法名称,如果这个方法返回True表示当前的元素匹配并且找到 |
0x004 官方示例
def has_class_but_no_id(tag):
    """Return True for a tag that has a ``class`` attribute but no ``id``."""
    return tag.has_attr('class') and not tag.has_attr('id')


# Function filter: every tag for which the function returns True matches.
soup.find_all(has_class_but_no_id)

# Keyword filter: all tags whose attribute ``Key`` has the value 'Value'.
tag.find_all(Key='Value')

# Several keyword filters can be combined, and each may be a compiled regular
# expression. A tag must satisfy ALL of them (logical AND). A keyword can be
# given only once per call, so two different attribute names are used here.
soup.find_all(Key1=re.compile("RegExp"), Key2='Value')

# The ``text`` argument searches the string content of the document.
tag.find_all(text='xxx')
# ``text`` accepts a string, a regular expression, a list or a boolean.
tag.find_all(text=['xxx', 'xxx'])

# Return at most ``Number`` matching tags.
tag.find_all('TagName', limit=Number)

# recursive=False only looks at direct children; the default (True) searches
# all descendants.
tag.find_all('TagName', recursive=False)

# find() returns the first match directly, or None when nothing matches,
# whereas find_all() returns an empty list [].
tag.find(name, attrs, recursive, text, **kwargs)

# Similar methods:
tag.find_parents()
tag.find_parent()
tag.find_next_siblings()
tag.find_next_sibling()
tag.find_previous_siblings()
tag.find_previous_sibling()
tag.find_all_next()
tag.find_next()
tag.find_all_previous()
tag.find_previous()

# Beautiful Soup supports most CSS selectors through select().
tag.select('CssSelector')

# Modifying the tree
tag.append("Content")    # append content to the tag
soup.new_string()        # create a new string object (a method of the BeautifulSoup object, not of Tag)
soup.new_tag()           # create a new tag object (a method of the BeautifulSoup object, not of Tag)
tag.insert()             # insert an object into the tag at a given position
tag.insert_before()      # insert a new object immediately before this tag
tag.insert_after()       # insert a new object immediately after this tag
tag.clear()              # remove the contents of this tag
tag.extract()            # remove this tag from the tree and return it
tag.decompose()          # remove this tag from the tree and destroy it completely
tag.replace_with()       # replace this tag with another object
tag.wrap()               # wrap this tag in the tag object passed in
tag.unwrap()             # replace this tag with its contents; returns the tag that was removed
tag.prettify()           # return the tree formatted as a Unicode string
tag.get_text()           # return the text content of this tag
0x005 自动登录GitHub
# -*- coding:utf8 -*-
import requests
from bs4 import BeautifulSoup

# Username and password
username = 'xxx'
password = 'xxx'

# Request headers.
# 'br' is deliberately not advertised in Accept-Encoding: requests can only
# decode brotli when an optional package is installed, and an undecodable
# body would make the token lookup below fail.
header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Host': 'github.com',
    'Referer': "https://github.com/xvGe/xvGe.github.io",
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
}

# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the script forever.
timeout = 10

# A Session keeps the cookies from the login page and sends them back with
# the POST automatically, and the ``with`` block closes every connection.
with requests.Session() as session:
    session.headers.update(header)

    # Fetch the login page
    response = session.get('https://github.com/login', timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, features='lxml')

    # Extract the login token; fail with a clear message if the page layout
    # is not what this script expects.
    token_input = soup.find(name='input', attrs={'name': "authenticity_token"})
    if token_input is None:
        raise RuntimeError('authenticity_token input not found on the login page')
    token = token_input['value']

    # Cookies collected so far (for inspection only; the session resends them)
    cookie = session.cookies.get_dict()

    # Login form data
    formData = {
        'commit': 'Sign in',
        'utf8': '✓',
        'authenticity_token': token,
        'login': username,
        'password': password,
    }

    # Submit the login form
    response = session.post('https://github.com/session', data=formData, timeout=timeout)
    response.raise_for_status()
声明:本站所有文章资源内容,如无特殊说明或标注,均为采集网络资源。如若本站内容侵犯了原著者的合法权益,可联系本站删除。