Python web crawler
Crawler mind map; the project files are updated from time to time
import requests  # HTTP download library
from bs4 import BeautifulSoup  # library for parsing web page data
from scrapy.cmdline import execute  # lets you run scrapy commands from a script
execute(["scrapy", "crawl", "jobbole"])  # runs "scrapy crawl jobbole" inside the scrapy environment
import sys
sys.path.append(os.path.dirname(os.path.abspath(__file__)))  # add the project directory to the import path
import re  # regular expressions, used for data cleaning and extraction
re.compile(r"/view/\d+\.htm")  # pre-compile a pattern for reuse
import os  # OS module, used for building result file paths, etc.
os.path.abspath(__file__)  # absolute path of the current file
os.path.dirname(os.path.abspath(__file__))  # directory containing the current file
import codecs  # used to create result files and write them with an explicit encoding
import urllib.request  # the Python 3 equivalent of urllib2 in Python 2
urllib.request.urlopen(url)  # open a url
urllib.request.urlopen(url).getcode()  # get the status code of the response
parse.urljoin(base_url, url)  # join a url with the base url (from urllib import parse)
parse.urljoin(response.url, url)  # in scrapy, join a url with the response's url
Python modules
Logic for writing items to a JSON file
import codecs
import json
class ScrapyJsonPipeline(object):
    def __init__(self):
        self.file = codecs.open('scrapy_jobbole.json', 'w', encoding="utf-8")
    def process_item(self, item, spider):
        lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(lines)
        return item
    def close_spider(self, spider):
        self.file.close()
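For this pipeline to actually run, it also has to be registered in the project's settings.py; a minimal sketch (the dotted module path depends on your project layout and is a placeholder here):
# settings.py
ITEM_PIPELINES = {
    "myproject.pipelines.ScrapyJsonPipeline": 300,  # lower number = higher priority
}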
Generate an MD5 digest of a URL
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2017/9/6 17:58
# @Author: jecht
# @File : get_md5.py
import hashlib
def get_md5(url):
    if isinstance(url, str):
        url = url.encode("utf-8")
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()
if __name__ == "__main__":
    print(get_md5("http://jobbole.com"))
Setting file paths
Add the path of the directory two levels above the current file:
path = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
sys.path.insert(0, path)
Get the directory containing the current file (when a relative path is in use):
path1 = os.path.abspath(os.path.dirname(__file__))
Build the absolute path of a file inside that directory:
os.path.join(path1, 'filename')
Filter a list with filter()
page_urls = filter(lambda x: True if x.startswith("https") else False, page_urls)
Join the elements of a list into a comma-separated string
raink_item = ",".join(str(i.strip()) for i in self["raink_item"])
Converting between dates, times, and timestamps
Timestamp to a formatted date string
datetime.datetime.fromtimestamp(self["ans_create_time"]).strftime("%Y/%m/%d %H:%M:%S")
String to datetime
datetime.datetime.strptime(date_value, "%Y/%m/%d").date()
e.g. 2017/11/11
datetime.datetime.strptime(date_value, "%Y/%m/%d %H:%M:%S")
e.g. 2017/11/11 17:00:00
Chinese year-month-day string to a date
datetime.datetime.strptime(date_value, "%Y年%m月%d日").date()
e.g. 2017年11月11日
Data type conversions
str to int (strip the thousands separator with replace)
int(get_number("".join(self["comment_num"])).replace(",", ""))
tuple or list to str
"".join(list(x))
Build a dict from two lists
a = list()
b = list()
c = dict(zip(a, b))
for (x, y) in zip(a, b): print(x, y)  # iterate over both lists in parallel
Generate a random number
from random import randint
randint(0, 21)
# random integer from 0 to 21 inclusive (randint includes both endpoints; use randint(0, 20) for 0-20)
Crawler architecture
URL manager
Prevents duplicate crawling (minimum feature set)
1. Add new URLs to the to-crawl set
2. Check whether a URL to be added is already in the container
3. Fetch a URL to crawl
4. Check whether any URLs are left to crawl
5. Move a URL from the to-crawl set to the crawled set
Implementation options (see the Redis sketch below)
1. In memory: store to-crawl and crawled URLs in Python set() objects
2. Relational database: a MySQL table urls(url, is_crawled), where is_crawled marks whether the URL has been crawled
3. Cache database: Redis, storing to-crawl and crawled URLs in sets
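A minimal sketch of option 3 using the redis-py client; the key names and connection parameters are illustrative assumptions, not from the original notes:
import redis

r = redis.StrictRedis(host="localhost", port=6379, db=0, decode_responses=True)

def add_new_url(url):
    # only queue the url if it has not been crawled yet
    if not r.sismember("crawled_urls", url):
        r.sadd("new_urls", url)

def get_new_url():
    # pop one url to crawl and move it to the crawled set
    url = r.spop("new_urls")
    if url:
        r.sadd("crawled_urls", url)
    return url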
URL downloader (urllib2, Python 2)
import urllib2
import cookielib
url = "http://www.baidu.com"
Method 1
response = urllib2.urlopen(url)
print response.getcode()  # status code returned for the url
print response.read()  # content of the url
Method 2 (add request headers)
request = urllib2.Request(url)
request.add_header("user-agent", "Mozilla/5.0")
response2 = urllib2.urlopen(request)
print response2.getcode()
print len(response2.read())
Method 3 (add cookie handling)
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
urllib2.install_opener(opener)
response3 = urllib2.urlopen(url)
print response3.getcode()
print len(response3.read())
url = "http://www.baidu.com"
第一种方法
response = urllib2.urlopen(url)
print response.gedcode() 访问url后的获取码
print response.read() 访问url的内容
第二种方法(增加url头文件)
request = urllib2.Request(url)
request.add_header("user-agent","Mozilla/5.0")
response2 = urllib2.urlopen(url)
print response2.getcode()
print len(response2.read())
第三种方法(增强cook处理能力)
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
urllib2.install_opener(opener)
response3 = urllib2.urlopen(url)
print response3.getcode()
print len(response3.read())
Web page parser
Beautiful Soup
Beautiful Soup: a third-party Python library for extracting data from HTML and XML
pip install beautifulsoup4
from bs4 import BeautifulSoup
1. Create a BeautifulSoup object, passing three arguments: the HTML document string, the parser, and the document encoding
soup = BeautifulSoup(html_doc, 'html.parser', from_encoding='utf8')
2. Search for nodes (find_all, find)
find: returns the first matching node
find_all: returns all matching nodes
soup.find_all('a')
soup.find_all('a', href=re.compile(r'/view/\d+\.htm'))
soup.find_all('div', class_='abc', string='python')  # find div nodes whose class is "abc" and whose text is "python"
Accessing node information
node.name  # tag name of the matched node
node['href']  # href attribute of a matched a node
node.get_text()  # text inside the a node
Given an html_doc document, extract all the link information
html_doc = " "
soup = BeautifulSoup(html_doc, 'html.parser', from_encoding='utf8')
links = soup.find_all('a')
for link in links:
    print link.name, link['href'], link.get_text()
Difference between session and cookie
cookie
The browser's local storage mechanism; data is stored as key-value pairs, like a dict
{"sessionkey": "value"}  # the value is a piece of text that the browser parses automatically
The browser sends a stateless request to the server; the server returns the requested data together with an identifying id, which the browser stores in a local cookie.
When the browser later sends a request carrying that id, the server can tell what this browser requested before.
session
Because cookies are insecure, the local cookie must not store the username and password. When the server returns the requested data and the identifying id,
it keeps the login information on the server side and returns a session_id derived from it to the browser. On the next login,
the browser sends its local session_id to the server, and the server looks up the stored login information by session_id to authenticate the browser.
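A minimal sketch of this idea with the requests library: the Session object keeps the cookies (including any session id set by the server) and sends them automatically on later requests. The URL and form fields below are placeholders, not from the original notes.
import requests

session = requests.Session()
# Set-Cookie headers from the server (e.g. a session id) are stored in session.cookies
session.post("https://example.com/login", data={"user": "u", "password": "p"})
# later requests automatically carry the stored cookies,
# so the server can associate them with the earlier login
resp = session.get("https://example.com/profile")
print(resp.status_code)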
Common HTTP status codes
200: the request was handled successfully
301/302: permanent redirect / temporary redirect
403: access forbidden (no permission)
404: the requested resource does not exist
500: internal server error
503: the server is down or temporarily unavailable (maintenance or overload)
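A small sketch of how a downloader might branch on these codes with urllib.request; the error handling here is my own illustration:
import urllib.request
from urllib.error import HTTPError

def download(url):
    try:
        # urllib follows 301/302 redirects automatically
        response = urllib.request.urlopen(url)
    except HTTPError as e:
        # 403, 404, 500, 503 and other error responses arrive as HTTPError
        print("request failed with status", e.code)
        return None
    if response.getcode() == 200:
        return response.read()
    return None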
scrapy framework
scrapy architecture diagram
1. A spider issues a request via yield Request; the engine passes the request to the scheduler.
2. The scheduler receives the request and, via the engine, sends it through the downloader middlewares to the downloader.
3. The downloader returns a response through the downloader middlewares; after the spider middlewares, the engine forwards it to the spiders.
4. The spiders return items and requests after the spider middlewares; the engine sends items to the item pipelines for storage
and sends requests back to the scheduler, looping back to step 2.
The source code lives in site-packages/scrapy/core.
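A minimal spider sketch that exercises this loop; the domain, selectors, and item fields are illustrative assumptions:
import scrapy

class DemoSpider(scrapy.Spider):
    name = "demo"
    start_urls = ["http://example.com/list"]

    def parse(self, response):
        # requests yielded here travel engine -> scheduler -> downloader (steps 1-2)
        for href in response.css("a::attr(href)").extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_detail)

    def parse_detail(self, response):
        # items yielded here are routed by the engine to the item pipelines (step 4)
        yield {"url": response.url, "title": response.css("title::text").extract_first("")}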
Parameters of request and response
request (see the sketch below)
url: the url
callback=None: callback function
method='GET'/'POST'
headers=None: header information
body=None
cookies=None: cookie information (can be a dict or a list)
meta=None: values passed on to the next callback
encoding='utf-8': encoding of the request
priority=0: affects the scheduler's scheduling priority
dont_filter=False: controls duplicate filtering (False means the request goes through the duplicate filter; set True to bypass it)
errback=None: error callback function
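A short sketch showing these parameters on a scrapy.Request; the url and callback are placeholders:
import scrapy

def parse_detail(response):
    pass

request = scrapy.Request(
    url="http://example.com/page",
    callback=parse_detail,            # parse function for the response
    method="GET",
    headers={"User-Agent": "Mozilla/5.0"},
    cookies={"sessionid": "xyz"},
    meta={"page": 1},                 # available later as response.meta["page"]
    encoding="utf-8",
    priority=10,                      # higher priority is scheduled earlier
    dont_filter=True,                 # bypass the duplicate filter for this request
)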
errback example:
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError, TimeoutError, TCPTimedOutError
yield scrapy.Request(url, callback=self.parse_httpbin, errback=self.errback_httpbin, dont_filter=True)
def errback_httpbin(self, failure):
    self.logger.error(repr(failure))
    if failure.check(HttpError):
        response = failure.value.response
        self.logger.error('HttpError on %s', response.url)
    elif failure.check(DNSLookupError):
        request = failure.request
        self.logger.error('DNSLookupError on %s', request.url)
    elif failure.check(TimeoutError, TCPTimedOutError):
        request = failure.request
        self.logger.error('TimeoutError on %s', request.url)
Methods: copy(), replace()
See the official documentation at scrapy.org
response
url: the url
status=200: status code
headers=None: header information
body=b' ': page content
flags=None
request=None
Methods:
copy()
urljoin()
replace()
Subclasses:
TextResponse
HtmlResponse
XmlResponse
HtmlResponse:
HtmlResponse inherits from TextResponse
TextResponse provides the two methods xpath and css
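A quick illustration of those two methods, for example inside scrapy shell; the selectors are placeholders:
# scrapy shell http://example.com
title_from_xpath = response.xpath('//title/text()').extract_first("")
title_from_css = response.css('title::text').extract_first("")
# urljoin() resolves a relative link against response.url
first_link = response.urljoin(response.css('a::attr(href)').extract_first(""))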
Making a word cloud
Create a working directory
mkdir demo ; cd demo
Install the wheel package
pip install wheel
Download and install the wordcloud module
http://www.lfd.uci.edu/~gohlke/pythonlibs/
pip install wordcloud-1.3.2-cp36-cp36m-win32.whl
Install notebook
pip install jupyter
jupyter notebook
Save the crawled text as a jay.txt file
filename = "jay.txt"
with open(filename) as f:
mytext = f.read()
mytext
from wordcloud import WordCloud
wordcloud = WordCloud().generate(mytext)
For Chinese text, add a word segmentation tool and download a Chinese font into the demo directory
Chinese font:
https://s3-us-west-2.amazonaws.com/notion-static/b869cb0c7f4e4c909a069eaebbd2b7ad/simsun.ttf
Install jieba:
pip install jieba
import jieba
mytext = " ".join(jieba.cut(mytext))
Change to: wordcloud = WordCloud(font_path="simsun.ttf").generate(mytext)
%pylab inline
import matplotlib.pyplot as plt
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
1. Crawling Baidu Baike pages with Python and BeautifulSoup
spider_main (main crawler module)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2017/8/29 2:33
# @Author: jecht
# @File : spider_main.py
from baidubaike_spider import url_manager, url_download, url_parser, spider_output
class SpiderMain(object):
def __init__(self):
self.urls = url_manager.UrlManager()
self.downloader = url_download.UrlDownload()
self.parser = url_parser.UrlParser()
self.output = spider_output.SpiderOutput()
def craw(self, root_url):
count = 1
self.urls.add_new_url(root_url)
#当有待爬取的url时
while self.urls.has_new_url():
try:
#获取新的url
new_url = self.urls.get_new_url()
print('craw %d : %s' %(count, new_url))
#下载新的url页面
html_count = self.downloader.download(new_url)
#将url和所在的页面内容通过parser函数得到新的new_url数据和new_data数据
new_urls,new_data = self.parser.parser(new_url,html_count)
#将新的url数据补充到url管理器当中
self.urls.add_new_urls(new_urls)
#将新的数据存放到output文件当中
self.output.collect_data(new_data)
if count == 1000:
break
count = count + 1
except:
print('craw failed!')
self.output.output_html()
if __name__=="__main__":
root_url = "http://baike.baidu.com/view/21087.htm"
obj_spider = SpiderMain()
obj_spider.craw(root_url)
url_manager (URL manager)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2017/8/29 2:35
# @Author: jecht
# @File : url_manager.py
class UrlManager(object):
def __init__(self):
self.new_urls = set()
self.old_urls = set()
#管理器中新添加的url
def add_new_url(self, url):
if url is None:
return
if url not in self.new_urls and url not in self.old_urls:
self.new_urls.add(url)
#管理器中新添加批量的url
def add_new_urls(self, urls):
if urls is None or len(urls) == 0:
return
for url in urls:
self.add_new_url(url)
#判断管理器中是否有新的待爬取url
def has_new_url(self):
return len(self.new_urls) != 0
#从管理器中获取一个新的待爬取的url
def get_new_url(self):
new_url =self.new_urls.pop()
self.old_urls.add(new_url)
return new_url
url_download (URL downloader)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2017/8/29 2:36
# @Author: jecht
# @File : url_download.py
import urllib.request
class UrlDownload(object):
def download(self, url):
if url is None:
return None
response = urllib.request.urlopen(url)
if response.getcode() != 200:
return None
return response.read()
url_parser (URL parser)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2017/8/29 15:41
# @Author: jecht
# @File : url_parser.py
import re
from urllib import parse as urlparse  # urljoin lives in urllib.parse in Python 3
from bs4 import BeautifulSoup
class UrlParser(object):
def _get_new_urls(self,page_url,soup):
new_urls =set()
links = soup.find_all('a',href=re.compile(r"/view/\d+\.htm"))
for link in links:
new_url = link['href']
#拼接url
new_full_url = urlparse.urljoin(page_url,new_url)
new_urls.add(new_full_url)
return new_urls
def _get_new_data(self, page_url, soup):
res_data={}
#url
res_data['url'] = page_url
#<div class="lemma-summary" label-module="lemmaSummary">
title_node =soup.find('dd',class_="lemmaWgt-lemmaTitle-title").find("h1")
res_data['title'] = title_node.get_text()
summary_node = soup.find('div', class_="lemma-summary")
res_data['summary'] = summary_node.get_text()
return res_data
def parser(self, page_url, html_count):
if page_url is None or html_count is None:
return
soup = BeautifulSoup(html_count,'html.parser',from_encoding='utf-8')
new_urls = self._get_new_urls(page_url,soup)
new_data = self._get_new_data(page_url,soup)
return new_urls,new_data
spider_output (output writer)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2017/8/29 12:33
# @Author: jecht
# @File : spider_output.py
class SpiderOutput(object):
def __init__(self):
self.datas = []
def collect_data(self, data):
if data is None:
return
self.datas.append(data)
def output_html(self):
fout = open('output.html','w')
fout.write("<html>")
fout.write("<body>")
fout.write("<table>")
#ascii
for data in self.datas:
fout.write("<tr>")
fout.write("<td>%s</td>" % data['url'])
fout.write("<td>%s</td>" % data['title'].encode('utf-8'))
fout.write("<td>%s</td>" % data['summary'].encode('utf-8'))
fout.write("</tr>")
fout.write("</table>")
fout.write("</body>")
fout.write("</html>")
fout.close()
Installing a virtual environment and scrapy
Windows (virtualenvwrapper)
Windows (recommended: the virtualenvwrapper management package)
Install it: pip install virtualenvwrapper-win
Change the virtualenv directory: Computer -> Properties -> Advanced system settings -> Environment Variables -> New variable:
variable name WORKON_HOME / value D:\python_project\
Create and enter a new virtualenv: mkvirtualenv scrapy_virtualenv (use --python c:\*** to target a different Python version)
List virtualenvs: workon
Enter a virtualenv: workon scrapy_test
Leave the virtualenv: deactivate
Install scrapy inside the virtualenv (requires the Visual C++ Build Tools)
pip install -i https://pypi.douban.com/simple scrapy
List installed packages: pip list
Enter the virtualenv: workon scrapy_virtualenv
Create a new scrapy project: scrapy startproject ArticleSpider
(New Scrapy project 'ArticleSpider', using template directory
'd:\\envs\\article_scrapytest\\lib\\site-packages\\scrapy\\templates\\project',
created in: e:\virtualenv_project\ArticleSpider)
The scrapy project can now be opened in PyCharm to write the spider
Inside the project, generate a spider: scrapy genspider jobbole blog.jobbole.com (creates jobbole.py from the basic template)
PyCharm usage
(Set the project interpreter) File -> Settings -> Project Interpreter -> Add Local -> select the virtualenv's python.exe
scrapy shell http://blog.jobbole.com/110287/  # debug scrapy selectors against a page in the shell
response.body  # full page content
response.css("")
response.xpath("")
Windows
Install a virtual environment
Windows (requires pip and Python; after downloading pip, run python setup.py install in cmd and add python\Scripts to PATH)
Install virtualenv: pip install virtualenv
Create a virtualenv: virtualenv --python <real path to python.exe> scrapy_test (point --python at the actual interpreter)
Enter and activate it: cd scrapy_test\
activate.bat
deactivate.bat
Install scrapy inside the virtualenv: pip install -i https://pypi.douban.com/simple scrapy
In the directory you want, create the project: scrapy startproject scrapy_bolezaixian
Inside the project's virtualenv, install pypiwin32: pip install pypiwin32
Linux
Linux:
Install virtualenv: yum -y install python-virtualenv
Create a virtualenv: virtualenv -p /usr/bin/python3 scrapy_test (or simply virtualenv scrapy_test)
Enter and activate it: cd scrapy_test/bin
source activate
source deactivate
Linux (installing scrapy system-wide with easy_install)
Install the dependency packages
yum install libxslt-devel libffi libffi-devel python-devel gcc openssl openssl-devel
Install pyOpenSSL
wget http://launchpadlibrarian.net/58498441/pyOpenSSL-0.11.tar.gz -P /opt/
tar xf pyOpenSSL-0.11.tar.gz -C /usr/local/src/
python setup.py install
Install setuptools (to get easy_install for Twisted)
wget http://pypi.python.org/packages/source/s/setuptools/setuptools-0.6c11.tar.gz -P /opt/
tar zxvf setuptools-0.6c11.tar.gz -C /usr/local/src/
python setup.py install
Install Twisted, w3lib, and lxml
easy_install Twisted
easy_install -U w3lib
easy_install lxml
Install scrapy
easy_install -U Scrapy
Field extraction syntax
XPath syntax
article            selects all child nodes of article elements
/article           selects the root article element
//*                selects all elements
//article          selects all article elements
//@class           selects all class attributes
article/a          selects all a elements that are children of article elements
article//a         selects all a elements that are descendants of article elements
//article/a | //article/b   selects all a and b elements under article elements
XPath syntax: predicates
/article/a[1]        selects the first a element under article
/article/a[last()]   selects the last a element under article
/article/a[last()-1] selects the second-to-last a element under article
//a[@class]          selects all a elements that have a class attribute
//a[@class='eng']    selects all a elements whose class attribute equals "eng"
//a[@*]              selects all a elements that have any attribute
XPath helper methods
text()               output the text content of the node
extract() (selector method)   output only the data values
strip()              remove newlines, carriage returns, spaces, etc.
replace("。", "")    replace, e.g. replace "。" with nothing
contains()           matches when the attribute contains the given value
@href                equivalent to ::attr(href) in css
title = response.xpath('//*[@id="post-111585"]/div[1]/h1/text()').extract()[0]
create_date = response.xpath('//*[@id="post-111585"]/div[2]/p/text()').extract()[0].replace("·","").strip()
praise = response.xpath('//span[contains(@class,"vote-post-up")]/h10/text()').extract()[0]
page_url = response.xpath('//div[@class="grid-8"]/div/div/a/@href').extract()
CSS syntax
title = response.css(".entry-header h1::text").extract()
datetime = response.css('p.entry-meta-hide-on-mobile::text').extract()[0].strip().replace(".","").strip()
page_url = response.css("#archive .floated-thumb .post-thumb a::attr(href)").extract()
CSS selectors
*                          selects all nodes
#container                 selects all nodes with id="container"
.container                 selects all nodes whose class contains container
container a                selects all a nodes under container elements (descendants)
container + ul             selects the ul node immediately after the container element (sibling)
div#container>ul           selects the first ul child of the div with id="container"
ul ~ p                     selects all p elements that are siblings following the ul
a[title]                   selects all a elements that have a title attribute
a[href="http://jobbole.com"]   selects all a elements whose href equals "http://jobbole.com"
a[href*="jobole"]          selects all a elements whose href contains "jobole"
a[href^="http"]            selects all a elements whose href starts with "http"
a[href$=".jpg"]            selects all a elements whose href ends with ".jpg"
input[type=radio]:checked  selects checked input elements with type="radio"
div:not(#container)        selects all div elements whose id is not "container"
li:nth-child(3)            selects the third li element
tr:nth-child(2n)           selects the even-numbered tr elements
tr:nth-child(-n+4)         selects the first four elements
tr:nth-child(n+4)          selects elements from the fourth onward
tr:last-child              selects the last tr element
CSS syntax in scrapy
::text                     equivalent to text() in xpath
::attr(href)               gets the value of the href attribute
extract()[0] / extract_first("")   extract()[0] raises when the list is empty, so extract_first("") is preferred
yield                      hands the request/item over to scrapy for downloading
Request(url="", callback="")   requests the url and passes the response to another parse function (or this one) as a callback
parse.urljoin(response.url, post_url)   joins two partial urls into a complete url
2. Crawling the Jobbole (伯乐在线) site with scrapy
main.py (in the top-level scrapy project directory)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2017/9/3 14:21
# @Author: jecht
# @File : main.py
import sys
import os
from scrapy.cmdline import execute
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(["scrapy","crawl","jobbole"])
#set ROBOTSTXT_OBEY = False in settings.py, otherwise many urls will be filtered out by robots.txt rules
from scrapy.cmdline import execute  # lets you run scrapy commands from a script
execute(["scrapy", "crawl", "jobbole"])  # runs "scrapy crawl jobbole" inside the scrapy environment
import sys
sys.path.append(os.path.dirname(os.path.abspath(__file__)))  # add the project directory to the import path
import os  # OS module, used for building paths
os.path.abspath(__file__)  # absolute path of the current file
os.path.dirname(os.path.abspath(__file__))  # directory containing the current file
spiders (directory containing all the spiders)
jobbole.py (created from the basic template via scrapy genspider jobbole blog.jobbole.com)
# -*- coding: utf-8 -*-
import re
from urllib import parse
import scrapy
from scrapy import Request
class JobboleSpider(scrapy.Spider):
name = 'jobbole'
allowed_domains = ['blog.jobbole.com']
start_urls = ['http://blog.jobbole.com/all-posts/']
def parse(self,response):
def parse_page(self,response):
pass
Crawl the article list page and hand each article url and the next-page url to scrapy for downloading
def parse(self, response):
    post_nodes = response.xpath('//div[@class="grid-8"]/div/div/a')
    for post_node in post_nodes:
        post_url = post_node.xpath('@href').extract_first("")
        image_url = post_node.xpath('img/@src').extract_first("")
        #page_url = response.css("#archive .floated-thumb .post-thumb a::attr(href)").extract()
        yield Request(url=parse.urljoin(response.url, post_url), meta={"front_image_url": image_url}, callback=self.parse_page)
    next_url = response.xpath('//div[contains(@class,"navigation")]/a[@class="next page-numbers"]/@href').extract()[0]
    if next_url:
        yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
.extract()[0]  # take the first extracted value ([1] would be the second); can be replaced with extract_first("")
yield  # hand over to scrapy for downloading
Request(url= , callback= )  # request the url and pass the response to the callback function for parsing or further downloading
parse.urljoin(baseurl, url)  # join baseurl and url into a complete url
Extracting the item fields from each article page
Populate the item by assigning each field individually
def parse_page(self, response):
jobboleItem = JobboleScrapyItem()
front_image_url = response.meta.get("front_image_url", "")
title = response.xpath('//div[@class="entry-header"]/h1/text()').extract_first("")
create_date = response.xpath('//div[@class="entry-meta"]/p/text()').extract_first("").strip().replace("·","").strip()
prase_number =response.xpath('//div[@class="post-adds"]/span[contains(@class,"vote-post-up")]/h10/text()').extract_first("")
collections_number = response.xpath('//div[@class="post-adds"]/span[contains(@class,"bookmark-btn")]/text()').extract_first("")
collections_re = re.match(".*?(\d+).*",collections_number)
if collections_re is None:
collections_number = 0
else:
collections_number = collections_re.group(1)
comments_number = response.xpath('//div[@class="post-adds"]/a/span[contains(@class,"hide-on-480")]/text()').extract_first("")
#提取两位数及以上的数字
comments_re = re.match(".*?(\d+).*",comments_number)
if comments_re is None:
comments_number = 0
else:
comments_number = comments_re.group(1)
targe_list =response.xpath('//div[@class="entry-meta"]/p/a/text()').extract()
targes = [targe for targe in targe_list if not targe.strip().endswith('评论')]
targe_list = ",".join(targes)
content = response.xpath('//div[@class="entry"]').extract()
#实例化item
jobboleItem["title"] = title
try:
create_date = datetime.datetime.strptime(create_date, "%Y/%m/%d").date()
except Exception as e:
create_date = datetime.datetime.now().date()
jobboleItem["create_date"] = create_date
jobboleItem["prase_number"] = prase_number
jobboleItem["collections_number"] = collections_number
jobboleItem["comments_number"] = comments_number
jobboleItem["targe_list"] = targe_list
jobboleItem["content"] = content
jobboleItem["front_image_url"] = [front_image_url]
jobboleItem["url"] = response.url
jobboleItem["url_object_id"] = get_md5.get_md5(response.url)
Field parsing notes for parse(self, response)
.extract_first("")  # take the first extracted value; equivalent to extract()[0], which raises on an empty list
.strip()  # remove newlines and other whitespace
.replace("。", "")  # replace "。" with nothing
comments_number = comments.group(1)  # group(1) takes the first captured group
[ele for ele in targe_list if not ele.strip().endswith('评论')]  # drop elements of targe_list that end with '评论'
Loading the item with an ItemLoader
# load the item via an ItemLoader
front_image_url = response.meta.get("front_image_url", "")
item_loaders = JobboleLoaderItem(item=JobboleScrapyItem(),response=response)
item_loaders.add_xpath("title", '//div[@class="entry-header"]/h1/text()')
item_loaders.add_xpath("create_date", '//div[@class="entry-header"]/h1/text()')
item_loaders.add_value("front_image_url", [front_image_url])
item_loaders.add_xpath("comments_number",'//div[@class="post-adds"]/a/span[contains(@class,"hide-on-480")]/text()')
item_loaders.add_xpath("collections_number",'//div[@class="post-adds"]/span[contains(@class,"bookmark-btn")]/text()')
item_loaders.add_xpath("prase_number", '//div[@class="post-adds"]/span[contains(@class,"vote-post-up")]/h10/text()')
item_loaders.add_xpath("targe_list", '//div[@class="entry-meta"]/p/a/text()')
item_loaders.add_xpath("content", '//div[@class="entry"]')
item_loaders.add_value("url", response.url)
item_loaders.add_value("url_object_id", get_md5.get_md5(response.url))
jobboleItem = item_loaders.load_item()
yield jobboleItem
Define the item class; the populated item that is yielded is handed to the pipelines for storage
1. Define the jobbole item in items.py
Declare each field of the custom item with scrapy.Field
class JobboleScrapyItem(scrapy.Item):
front_image_url = scrapy.Field()
title = scrapy.Field()
datetime = scrapy.Field()
prase_number = scrapy.Field()
collections_number = scrapy.Field()
comments_number = scrapy.Field()
targe_list = scrapy.Field()
content = scrapy.Field()
front_image_path = scrapy.Field()
url = scrapy.Field()
url_object_id = scrapy.Field()
Add every field that needs to be filled to the item; every field is declared as scrapy.Field
scrapy items have only one field type: Field
Loading the item through an ItemLoader (items.py)
import datetime
import re
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose,TakeFirst,Join
class ScrapyProjectItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
#define a custom ItemLoader: convert the list values to str and take the first element via TakeFirst
def date_convert(value):
try:
create_date = datetime.datetime.strptime(value,"%Y/%m/%d").date()
except Exception as e:
create_date = datetime.datetime.now().date()
return create_date
def get_number(value):
collections_re = re.match(".*?(\d+).*", value)
if collections_re is None:
value_number = "0"
else:
value_number = collections_re.group(1)
return value_number
def delect_comment(value):
if "评论" in value:
return ""
else:
return value
def return_value(value):
return value
class JobboleLoaderItem(ItemLoader):
default_output_processor = TakeFirst()
class JobboleScrapyItem(scrapy.Item):
front_image_url = scrapy.Field(
output_processor = MapCompose(return_value)
)
title = scrapy.Field()
create_date = scrapy.Field(
input_processor = MapCompose(date_convert)
)
prase_number = scrapy.Field(
input_processor = MapCompose(get_number)
)
collections_number = scrapy.Field(
input_processor = MapCompose(get_number)
)
comments_number = scrapy.Field(
input_processor = MapCompose(get_number)
)
targe_list = scrapy.Field(
input_processor = MapCompose(delect_comment),
output_processor = Join(",")
)
content = scrapy.Field()
front_image_path = scrapy.Field()
url = scrapy.Field()
url_object_id = scrapy.Field()
2. Enable the pipelines in settings.py
ITEM_PIPELINES = {
    'Article.Spider.pipelines.ArticlespiderPipeline': 300,
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
#image saving settings
IMAGES_URLS_FIELD = 'front_image_url'
project_path = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_path, 'images/jobbole_images')
#(downloading images requires the PIL/Pillow library; install it inside the virtualenv)
#(pip install -i https://pypi.douban.com/simple pillow)
1. Set each pipeline's priority in settings (a lower number runs earlier)
2. Use os.path to build the path where images are stored
3. 'scrapy.pipelines.images.ImagesPipeline'
To customize image handling, define a new class in pipelines.py that inherits ImagesPipeline from scrapy's images module and override its methods to control the image file format and to filter out unimportant images (see the settings sketch below)
4. IMAGES_URLS_FIELD, IMAGES_STORE
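Related to point 3: besides subclassing, scrapy's ImagesPipeline can drop images that are too small through two settings; the values below are illustrative:
# settings.py
IMAGES_MIN_HEIGHT = 100  # ignore images less than 100 px tall
IMAGES_MIN_WIDTH = 100   # ignore images less than 100 px wide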
Hand the crawled data to the pipelines to be written to files or a database
Configure the execution order of the pipelines in settings
Saving images to a local folder
1. Define the local download path for images in settings.py:
IMAGES_URLS_FIELD = 'front_image_url'
project_path = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_path, 'images/jobbole_images')
2. ImagesPipeline downloads the files; override item_completed to record the actual saved path
from scrapy.pipelines.images import ImagesPipeline
class ScrapyImagePipeline(ImagesPipeline):
    #get_media_requests builds a request for each image url and hands it to the downloader
    #item_completed gives access to the path each file was actually saved to
    def item_completed(self, results, item, info):
        for ok, value in results:
            value_image_path = value["path"]
            item["front_image_path"] = value_image_path
        return item
Writing items to a local JSON file
Custom pipeline that writes a local JSON file
import codecs
import json
class ScrapyJsonPipeline(object):
def __init__(self):
self.file =codecs.open('scrapy_jobbole.json','w',encoding="utf-8")
def process_item(self, item, spider):
lines = json.dumps(dict(item),ensure_ascii=False) + "\n"
self.file.write(lines)
return item
def close_spider(self, spider):
self.file.close()
Using scrapy's built-in exporter to write a local JSON file
from scrapy.exporters import JsonItemExporter
class ScrapyItemExportersPipeline(object):
def __init__(self):
self.file = open("scrapy_exporter.json", "wb")
self.exporters = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
self.exporters.start_exporting()
def close_spider(self, spider):
self.exporters.finish_exporting()
self.file.close()
def process_item(self, item, spider):
self.exporters.export_item(item)
return item
Writing the data to a MySQL database
Synchronous insertion into MySQL
import MySQLdb
class ScrapyMysqlExporterPipline(object):
def __init__(self):
self.conn = MySQLdb.connect("127.0.0.1","root","wuting123","scrapy_jobbole",charset="utf8",use_unicode=True)
self.cursor = self.conn.cursor()
def process_item(self,item,spider):
insert_sql = '''
insert into jobbole(title,create_date,url,url_object_id,front_image_url,front_image_path,comments_number,
prase_number,collections_number,content,targe_list)
VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
'''
self.cursor.execute(insert_sql,(item["title"],item["create_date"],item["url"],item["url_object_id"],
item["front_image_url"],item["front_image_path"],item["comments_number"],item["prase_number"],
item["collections_number"],item["content"],item["targe_list"]))
self.conn.commit()
Asynchronous insertion into MySQL with Twisted
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi
class ScrapyMysqlTwistedPipeline(object):
def __init__(self,dbpool):
self.dbpool = dbpool
@classmethod
def from_settings(cls,settings):
dbparms = dict(
host = settings["MYSQL_IP"],
user = settings["MYSQL_USER"],
password = settings["MYSQL_PASSWD"],
db = settings["MYSQL_DBNAME"],
charset = "utf8",
cursorclass = MySQLdb.cursors.DictCursor,
use_unicode = True,
)
dbpool = adbapi.ConnectionPool("MySQLdb",**dbparms)
return cls(dbpool)
def process_item(self,item,spider):
#use twisted to make the mysql insert asynchronous
query = self.dbpool.runInteraction(self.do_insert,item)
query.addErrback(self.handle_error, item, spider)#处理异常
def handle_error(self, failure, item, spider):
print(failure)
def do_insert(self,cursor,item):
#perform the actual insert
insert_sql = '''
insert into jobbole(title,create_date,url,url_object_id,front_image_url,front_image_path,comments_number,
prase_number,collections_number,content,targe_list)
VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
'''
cursor.execute(insert_sql, (item["title"], item["create_date"], item["url"], item["url_object_id"],
item["front_image_url"], item["front_image_path"], item["comments_number"], item["prase_number"],
item["collections_number"], item["content"],item["targe_list"]))
3. Logging in to Zhihu and crawling it with scrapy
Zhihu login
Logging in to Zhihu with the requests library
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2017/9/20 9:59
# @Author: jecht
# @File : zhihu_login_request.py
import time
import requests
from PIL import Image
import re
try:
import cookielib
except:
import http.cookiejar as cookielib
session = requests.session()
session.cookies = cookielib.LWPCookieJar(filename="cookies.txt")
try:
session.cookies.load(ignore_discard=True)
except:
print("cookie未能加载")
url = "https://www.zhihu.com"
#agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safar
i/537.36 Edge/14.14393"
agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari
/537.36"
header = {
"Host": "www.zhihu.com",
"Referer": "https://www.zhihu.com/",
'User-Agent': agent,
}
#zhihu_login_request("17512009387", "wuting123")
#get_captcha()
#get_xsrf()
#get_index()
#get_ingore()
Get the _xsrf token
def get_xsrf():
response = session.get(url, headers=header)
# text = '<input type="hidden" name="_xsrf" value="9af460db3704806af07819ad14626e08"/>'
match = re.match('[\s\S]*name="_xsrf" value="?(.*)"', response.text)
if match:
return match.group(1)
else:
return ""
Get the captcha
def get_captcha():
t = str(int(time.time()*1000))
captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + "&type=login&lang=cn"
response = session.get(captcha_url,headers=header)
with open("captcha_image.gif", 'wb')as f:
f.write(response.content)
try:
img = Image.open("captcha.jpg")
img.show()
img.close()
except:
pass
points = [[20.7735, 22.7614], [45.7835, 22.6225], [66.7824, 22.6114], [90.7914, 21.6136], [118.7937, 23.6114], [143.7936, 22.6185], [160.7935, 22.6125]]
#points = ["%5B20.77%2C22.76%5D", "%5B45.78%2C22.62%5D", "%5B66.78%2C22.61%5D", "%5B90.79%2C21.61%5D", "%5B118.79%2C23.61%5D", "%5B143.79%2C22.61%5D", "%5B160.79%2C22.61%5D"]
seq = input('请输入倒立文字的位置\n>')
s = ""
for i in seq:
#s += str(points[int(i)-1]) + "%2C"
s += str(points[int(i) - 1]) + ", "
#captcha_base = '%7B%22img_size%22%3A%5B200%2C44%5D%2C%22input_points%22%3A%5B' + s[:-3] + '%5D%7D'
captcha_base = '{"img_size":[200,44],"input_points":[' + s[:-2] + ']}'
return captcha_base
Form login
def zhihu_login_request(account, password):
if re.match("^1\d{10}", account):
print("手机号码登录")
post_url = "https://www.zhihu.com/login/phone_num"
post_data = {
"_xsrf": get_xsrf(),
"captcha": get_captcha(),
"captcha_type": 'cn',
"password": password,
"phone_num": account,
"remember_me": 'true',
}
else:
if "@" in account:
print("邮箱登录")
post_url = "https://www.zhihu.com/login/email"
post_data = {
"_xsrf": get_xsrf(),
"captcha": get_captcha(),
"captcha_type": 'cn',
"password": password,
"email": account,
"remember_me": 'true',
}
response = session.post(post_url,data=post_data, headers=header)
session.cookies.save()
if re.match("^1\d{10}", account):
print("手机号码登录")
post_url = "https://www.zhihu.com/login/phone_num"
post_data = {
"_xsrf": get_xsrf(),
"captcha": get_captcha(),
"captcha_type": 'cn',
"password": password,
"phone_num": account,
"remember_me": 'true',
}
else:
if "@" in account:
print("邮箱登录")
post_url = "https://www.zhihu.com/login/email"
post_data = {
"_xsrf": get_xsrf(),
"captcha": get_captcha(),
"captcha_type": 'cn',
"password": password,
"email": account,
"remember_me": 'true',
}
response = session.post(post_url,data=post_data, headers=header)
session.cookies.save()
Load the cookie and fetch the index page
def get_index():
response = session.get(url, headers=header)
with open("zhihu_index.html","wb") as f:
f.write(response.text.encode("utf-8"))
print("已从cookie加载登录信息")
Check whether we are logged in
def get_ingore():
ignore_url = "https://www.zhihu.com/inbox"
response = session.get(ignore_url,headers = header)
if response.status_code != 200:
return False
else:
return True
Logging in to Zhihu with scrapy's Request
# -*- coding: utf-8 -*-
import json
import re
import scrapy
import time
from PIL import Image
import scrapy_project
import requests
class ZhihuSpider(scrapy.Spider):
name = 'zhihu'
allowed_domains = ['www.zhihu.com']
start_urls = ['http://www.zhihu.com/']
header = {
"Host": "www.zhihu.com",
"Referer": "https://www.zhihu.com/",
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.
3112.90 Safari/537.36",
}
def parse(self, response):
pass
def start_requests(self):
return [scrapy.Request('https://www.zhihu.com/#signin', headers=self.header, callback=self.login)]
def login(self,response):
response_text = response.text
re_xsrf = re.match('[\s\S]*name="_xsrf" value="?(.*)"', response_text)
get_xsrf = ""
if re_xsrf:
get_xsrf = re_xsrf.group(1)
if get_xsrf:
post_url = 'https://www.zhihu.com/login/phone_num'
post_data = {
"_xsrf": get_xsrf,
"captcha": "",
"captcha_type": 'cn',
"password": "wuting123",
"phone_num": "17512009387",
"remember_me": 'true'
}
captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + str(int(time.time() * 1000)) + "&type=login&lang=cn"
return [scrapy.Request(captcha_url, headers=self.header, meta={'post_data': post_data}, callback=self.login_after)]
Write the login function: it extracts the _xsrf token needed for login and leaves the captcha field blank for now
scrapy.Request() calls back into the next function, login_after, and passes post_data along through meta
def login_after(self,response):
with open("captcha.gif", 'wb')as f:
f.write(response.body)
try:
img = Image.open("captcha.gif")
img.show()
img.close()
except:
pass
points = [[20.7735, 22.7614], [45.7835, 22.6225], [66.7824, 22.6114], [90.7914, 21.6136], [118.7937, 23.6114],[143.7936, 22.6185], [160.7935, 22.6125]]
seq = input('请输入倒立文字的位置\n>')
s = ""
for i in seq:
s += str(points[int(i) - 1]) + ", "
captcha = '{"img_size":[200,44],"input_points":[' + s[:-2] + ']}'
post_data = response.meta.get("post_data",{})
post_data['captcha'] = captcha
#post_data = json.dumps(post_data)
if post_data:
return [scrapy.FormRequest(
url='https://www.zhihu.com/login/phone_num',
formdata=post_data,
headers=self.header,
callback=self.check_login
)]
with open("captcha.gif", 'wb')as f:
f.write(response.body)
try:
img = Image.open("captcha.gif")
img.show()
img.close()
except:
pass
points = [[20.7735, 22.7614], [45.7835, 22.6225], [66.7824, 22.6114], [90.7914, 21.6136], [118.7937, 23.6114],[143.7936, 22.6185], [160.7935, 22.6125]]
seq = input('请输入倒立文字的位置\n>')
s = ""
for i in seq:
s += str(points[int(i) - 1]) + ", "
captcha = '{"img_size":[200,44],"input_points":[' + s[:-2] + ']}'
post_data = response.meta.get("post_data",{})
post_data['captcha'] = captcha
#post_data = json.dumps(post_data)
if post_data:
return [scrapy.FormRequest(
url='https://www.zhihu.com/login/phone_num',
formdata=post_data,
headers=self.header,
callback=self.check_login
)]
PIL's Image is used to display the captcha image for manual input
The format of the post data was determined by capturing the login request
def check_login(self,response):
login_view = response.text
if "errcode" in login_view:
print("登入失败")
else:
print ("登入成功")
for url in self.start_urls:
yield scrapy.Request(url, dont_filter=True, headers=self.header)
Crawling Zhihu data
1. Crawl the overview (feed) page
import json
import re
from urllib import parse
import scrapy
import time
from PIL import Image
import scrapy_project
import requests
from scrapy.loader import ItemLoader
from scrapy_project.items import ZhihuQuestionItem, ZhihuAnswerItem
class ZhihuSpider(scrapy.Spider):
name = 'zhihu'
allowed_domains = ['www.zhihu.com']
start_urls = ['https://www.zhihu.com/']
#起始请求url
start_answer_url = 'http://www.zhihu.com/api/v4/questions/{0}/answers?sort_by=default&include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit={1}&offset={2}'
header = {
"Host": "www.zhihu.com",
"Referer": "https://www.zhihu.com/",
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36",
}
def parse(self, response):
page_url = response.css('a[data-za-detail-view-element_name="Title"]::attr(href)').extract()
for url in page_url:
url = parse.urljoin(response.url, url)
if "question" in url:
re_url = re.match('(.*)answer.*', url)
url = re_url.group(1)
question_id = re.match('.*/(\d+).*', url).group(1)
#解析出来的url保存并传递给parse_page解析
yield scrapy.Request(url, headers=self.header, meta={"questions_id": question_id}, callback=self.question_parse)
1. headers can be given defaults in settings; otherwise they have to be passed on every request (see the sketch below)
2. parse() crawls the overview page, extracts the url of every detail page, and yields it to the detail-page parser
3. The next-page url is yielded back to parse() itself, so crawling loops until the last page
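A sketch of point 1: default headers can be declared once in settings.py so that each Request does not need an explicit headers= argument; the values are the ones used above:
# settings.py
DEFAULT_REQUEST_HEADERS = {
    "Host": "www.zhihu.com",
    "Referer": "https://www.zhihu.com/",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36",
}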
2. Crawl the detail pages
def question_parse(self,response):
if "QuestionHeader" in response.text:
que_question_id = response.meta.get('questions_id', '')
Item_Que_Loaders = ItemLoader(item=ZhihuQuestionItem(), response=response)
Item_Que_Loaders.add_value('que_question_id',que_question_id)
Item_Que_Loaders.add_css('que_topic', 'div.QuestionTopic span a div div::text')
Item_Que_Loaders.add_css('que_title', 'h1.QuestionHeader-title::text')
Item_Que_Loaders.add_css('que_content', 'div.QuestionHeader-detail div div span')
Item_Que_Loaders.add_css('que_attention_num', 'button.NumberBoard-item div.NumberBoard-value::text')
Item_Que_Loaders.add_css('que_view_num', 'div.NumberBoard-item div.NumberBoard-value::text')
Item_Que_Loaders.add_css('que_comment_num', 'div.QuestionHeader-Comment button::text')
Item_Que_Loaders.add_css('que_answer_num', 'h4.List-headerText span::text')
Item_Que_Loaders.add_value('que_url', response.url)
Question_Item = Item_Que_Loaders.load_item()
yield scrapy.Request(self.start_answer_url.format(que_question_id, 20, 0), headers=self.header, callback=self.answer_parse)
yield Question_Item
def answer_parse(self, response):
json_data = json.loads(response.text)
is_end = json_data['paging']['is_end']
next_url = json_data['paging']['next']
total_answer_num = json_data['paging']['totals']
#ans_data = json_data['data']
for ans_data in json_data['data']:
Item_Ans_Loaders = ZhihuAnswerItem()
Item_Ans_Loaders['ans_author_name'] = ans_data['author']['name']
Item_Ans_Loaders['ans_author_idname'] = ans_data['author']['url_token'] if "url_token" in ans_data["author"] else None
Item_Ans_Loaders['ans_data_url'] = ans_data['url']
Item_Ans_Loaders['ans_question_id'] = ans_data['question']['id']
Item_Ans_Loaders['ans_voters_num'] = ans_data['voteup_count']
Item_Ans_Loaders['ans_comment_num'] = ans_data['comment_count']
Item_Ans_Loaders['ans_content'] = ans_data['content']
Item_Ans_Loaders['ans_create_time'] = ans_data['created_time']
Item_Ans_Loaders['ans_update_time'] = ans_data['updated_time']
yield Item_Ans_Loaders
if not is_end:
yield scrapy.Request(next_url, headers=self.header, callback=self.answer_parse)
1. Detail pages include the question page and the answer pages
2. Fields on the detail page are loaded with ItemLoader from scrapy.loader; at this stage every loaded value is a list
3. yield passes the answer url for each question_id on to the next function, answer_parse; the answer page it requests is a JSON document containing the answers
4. Because the answer content is JSON, json.loads() parses the page into json_data, and fields such as json_data['paging']['is_end'] can be read from it
3. Write the items
4. Write the pipelines
4. Crawling Lagou with scrapy's CrawlSpider
Use the crawl template instead of the default basic template
1. Go to the project directory
cd d:\python_project\scrapy_project\scrapy_project
2. Enter the virtualenv
workon scrapy_virtualenv
3. List the available spider templates
scrapy genspider --list
4. Create a spider from the crawl template
scrapy genspider -t crawl lagou www.lagou.com
CrawlSpider source code analysis
class CrawlSpider(Spider):
rules = ()
def __init__(self, *a, **kw):
super(CrawlSpider, self).__init__(*a, **kw)
self._compile_rules()
def parse(self, response):
return self._parse_response(response, self.parse_start_url, cb_kwargs={}, follow=True)
def parse_start_url(self, response):
return []
def process_results(self, response, results):
return results
def _build_request(self, rule, link):
r = Request(url=link.url, callback=self._response_downloaded)
r.meta.update(rule=rule, link_text=link.text)
return r
def _requests_to_follow(self, response):
if not isinstance(response, HtmlResponse):
return
seen = set()
for n, rule in enumerate(self._rules):
links = [lnk for lnk in rule.link_extractor.extract_links(response)
if lnk not in seen]
if links and rule.process_links:
links = rule.process_links(links)
for link in links:
seen.add(link)
r = self._build_request(n, link)
yield rule.process_request(r)
def _response_downloaded(self, response):
rule = self._rules[response.meta['rule']]
return self._parse_response(response, rule.callback, rule.cb_kwargs, rule.follow)
def _parse_response(self, response, callback, cb_kwargs, follow=True):
if callback:
cb_res = callback(response, **cb_kwargs) or ()
cb_res = self.process_results(response, cb_res)
for requests_or_item in iterate_spider_output(cb_res):
yield requests_or_item
if follow and self._follow_links:
for request_or_item in self._requests_to_follow(response):
yield request_or_item
def _compile_rules(self):
def get_method(method):
if callable(method):
return method
elif isinstance(method, six.string_types):
return getattr(self, method, None)
self._rules = [copy.copy(r) for r in self.rules]
for rule in self._rules:
rule.callback = get_method(rule.callback)
rule.process_links = get_method(rule.process_links)
rule.process_request = get_method(rule.process_request)
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
spider = super(CrawlSpider, cls).from_crawler(crawler, *args, **kwargs)
spider._follow_links = crawler.settings.getbool(
'CRAWLSPIDER_FOLLOW_LINKS', True)
return spider
def set_crawler(self, crawler):
super(CrawlSpider, self).set_crawler(crawler)
self._follow_links = crawler.settings.getbool('CRAWLSPIDER_FOLLOW_LINKS', True)
1. CrawlSpider inherits from the Spider class defined in __init__.py; Spider's entry point is start_requests()
def start_requests(self):
cls = self.__class__
if method_is_overridden(cls, Spider, 'make_requests_from_url'):
warnings.warn(
"Spider.make_requests_from_url method is deprecated; it "
"won't be called in future Scrapy releases. Please "
"override Spider.start_requests method instead (see %s.%s)." % (
cls.__module__, cls.__name__
),
)
for url in self.start_urls:
yield self.make_requests_from_url(url)
else:
for url in self.start_urls:
yield Request(url, dont_filter=True)
2. start_requests() uses parse() as its default callback, and parse() calls _parse_response()
def parse(self, response):
return self._parse_response(response, self.parse_start_url, cb_kwargs={}, follow=True)
def _parse_response(self, response, callback, cb_kwargs, follow=True):
if callback:
cb_res = callback(response, **cb_kwargs) or ()
cb_res = self.process_results(response, cb_res)
for requests_or_item in iterate_spider_output(cb_res):
yield requests_or_item
if follow and self._follow_links:
for request_or_item in self._requests_to_follow(response):
yield request_or_item
If callback is set (i.e. parse_start_url exists), cb_kwargs is passed into the callback and the result is a sequence;
that sequence then goes through process_results. Since both functions are empty by default, cb_res is an empty sequence; both can be overridden later.
The for loop iterates over cb_res (via iterate_spider_output) and yields each element, so if the two overridable functions are not redefined, this part of _parse_response effectively does nothing.
If follow=True (the default in _parse_response) and the setting CRAWLSPIDER_FOLLOW_LINKS has not been changed,
the second for loop runs and links are extracted through the LinkExtractor; otherwise the rules are never applied.
3._parse_response函数允许scrapy中自定义调用parse_start_url和process_results对response进行处理
4. 并通过_requests_to_follow调用scrapy中的rule,将response结果交给了rule中的LinkExtractor的extract_links方法用于将link全部抽取出来,并对每一个link进行一次类似yield scrapy.request(r)的功能
5.在yield之前在r内通过_build_request再到_response_downloaded加入了处理
4. 并通过_requests_to_follow调用scrapy中的rule,将response结果交给了rule中的LinkExtractor的extract_links方法用于将link全部抽取出来,并对每一个link进行一次类似yield scrapy.request(r)的功能
5.在yield之前在r内通过_build_request再到_response_downloaded加入了处理
    def parse_start_url(self, response):
        return []

    def process_results(self, response, results):
        return results

    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )
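Since both hooks return nothing by default, a subclass can override them; a minimal sketch (the class name, start URL and field are made up for illustration):

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class MySpider(CrawlSpider):
    name = "example"                      # hypothetical spider name
    start_urls = ["http://example.com/"]  # hypothetical start page
    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_start_url(self, response):
        # runs only on the responses for start_urls; may also yield items/requests
        self.logger.info("start url fetched: %s", response.url)
        return []

    def process_results(self, response, results):
        # last chance to filter/modify whatever the rule callbacks returned
        return results

    def parse_item(self, response):
        yield {"url": response.url}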
    def _requests_to_follow(self, response):
        if not isinstance(response, HtmlResponse):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [lnk for lnk in rule.link_extractor.extract_links(response)
                     if lnk not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = self._build_request(n, link)
                yield rule.process_request(r)
1. First check that response is an HtmlResponse object
2. seen is a set() used by the loop below to deduplicate the URLs extracted from the response
3. _rules comes from _compile_rules, which resolves each rule's callback and pre-processing hooks
4. enumerate(self._rules) yields the index n together with each rule
5. links is the result of calling the rule's link_extractor.extract_links on the response (i.e. the extracted URLs), keeping only links not already in seen
6. If the extract_links result is still too broad, process_links can filter the links one more time
7. Every accepted link is added to seen
8. Each link is then turned into a request via _build_request and process_request and handed off for download; a small example of extract_links is sketched below
6. LinkExtractor works off the arguments passed in, e.g. allow=r'Items/', deny, allow_domains and the other parameters listed below
7. The _build_request → _response_downloaded pair stores the rule index in request.meta and later pulls that rule back out of response.meta so the rule's callback can be invoked
    def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
                 tags=('a', 'area'), attrs=('href',), canonicalize=False,
                 unique=True, process_value=None, deny_extensions=None, restrict_css=(),
                 strip=True)
    def _compile_rules(self):
        def get_method(method):
            if callable(method):
                return method
            elif isinstance(method, six.string_types):
                return getattr(self, method, None)

        self._rules = [copy.copy(r) for r in self.rules]
        for rule in self._rules:
            rule.callback = get_method(rule.callback)
            rule.process_links = get_method(rule.process_links)
            rule.process_request = get_method(rule.process_request)

    def _build_request(self, rule, link):
        r = Request(url=link.url, callback=self._response_downloaded)
        r.meta.update(rule=rule, link_text=link.text)
        return r

    def _response_downloaded(self, response):
        rule = self._rules[response.meta['rule']]
        return self._parse_response(response, rule.callback, rule.cb_kwargs, rule.follow)
which finally hands control back to _parse_response
Parameters of Rule and LinkExtractor
    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )
Rule parameters
class Rule(object):
    def __init__(self, link_extractor, callback=None, cb_kwargs=None, follow=None,
                 process_links=None, process_request=identity):
        self.link_extractor = link_extractor
        self.callback = callback
        self.cb_kwargs = cb_kwargs or {}
        self.process_links = process_links
        self.process_request = process_request
        if follow is None:
            self.follow = False if callback else True
        else:
            self.follow = follow
1. link_extractor: the LinkExtractor instance whose extract_links does the actual extraction
2. callback: the callback function, defined in your spider file (e.g. lagou.py)
3. cb_kwargs: extra keyword arguments passed to the callback
4. follow: whether to keep following URLs matched by this rule
5. process_links: a pre-processing hook applied to the extracted links
6. process_request=identity: identity is a no-op pass-through that can be replaced with your own hook, similar to process_links; a small Rule example using process_links is sketched below
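A minimal sketch of a Rule that uses process_links to drop unwanted links before they are scheduled (the spider name, URL pattern and drop_list_pages method are made up for illustration):

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class FilteredSpider(CrawlSpider):
    name = "filtered_example"             # hypothetical
    start_urls = ["http://example.com/"]  # hypothetical

    rules = (
        Rule(LinkExtractor(allow=r'/jobs/\d+\.html'),   # hypothetical pattern
             callback='parse_job',
             follow=True,
             process_links='drop_list_pages'),
    )

    def drop_list_pages(self, links):
        # process_links receives the full list of Link objects matched by this rule
        return [l for l in links if "page=" not in l.url]

    def parse_job(self, response):
        yield {"url": response.url}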
LinkExtractor
LinkExtractor parameters
class LxmlLinkExtractor(FilteringLinkExtractor):
    def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
                 tags=('a', 'area'), attrs=('href',), canonicalize=False,
                 unique=True, process_value=None, deny_extensions=None, restrict_css=(),
                 strip=True):
1. allow: extract URLs matching these regular expressions
2. deny: discard URLs matching these regular expressions
3. allow_domains: only extract URLs belonging to these domains (like allowed_domains in lagou.py)
4. deny_domains: discard URLs belonging to these domains
5. restrict_xpaths: only look for links inside the regions selected by these XPath expressions
6. tags=('a', 'area'): default; look for URLs in <a> and <area> tags
7. attrs=('href',): default; take the URL from the href attribute
8. restrict_css=(): only look for links inside the regions selected by these CSS selectors
XPath works on the parsed XML/HTML tree directly; when restrict_css is used, the inherited FilteringLinkExtractor machinery calls HTMLTranslator() to convert the CSS selector into an equivalent XPath first
LinkExtractor's extract_links function
    def extract_links(self, response):
        base_url = get_base_url(response)
        if self.restrict_xpaths:
            docs = [subdoc
                    for x in self.restrict_xpaths
                    for subdoc in response.xpath(x)]
        else:
            docs = [response.selector]
        all_links = []
        for doc in docs:
            links = self._extract_links(doc, response.url, response.encoding, base_url)
            all_links.extend(self._process_links(links))
        return unique_list(all_links)
1. get_base_url(response) derives the base URL from the response
2. If restrict_xpaths is set, each XPath expression is applied to the response
3. and the matching sub-documents are collected into a list (otherwise the whole selector is used)
4. The list is then iterated and _extract_links() applies the extractor's allow/deny settings to each sub-document, producing the final deduplicated list of links
Writing rules and a LinkExtractor to crawl the whole Lagou site
Writing the items
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst


class LagouItemLoader(ItemLoader):
    default_output_processor = TakeFirst()


class LagouItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    url_object = scrapy.Field()
    degree_need = scrapy.Field()
    crawl_time = scrapy.Field()
    publish_time = scrapy.Field()
    targs = scrapy.Field()
    company_name = scrapy.Field()
    company_url = scrapy.Field()
    job_city = scrapy.Field()
    job_type = scrapy.Field()
    job_advantage = scrapy.Field()
    job_desc = scrapy.Field()
    job_addr = scrapy.Field()
    salsry_max = scrapy.Field()
    salsry_min = scrapy.Field()
    work_years_max = scrapy.Field()
    work_years_min = scrapy.Field()
Parsing the fields in lagou.py (a minimal spider sketch follows below)
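The note above leaves the spider itself empty, so here is a minimal sketch of what lagou.py could look like; the URL patterns, CSS selectors and field subset are illustrative guesses, not the author's actual code:

import datetime

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class LagouSpider(CrawlSpider):
    name = "lagou"
    allowed_domains = ["www.lagou.com"]
    start_urls = ["https://www.lagou.com/"]

    rules = (
        # follow list and company pages without parsing them (patterns are guesses)
        Rule(LinkExtractor(allow=r'zhaopin/.*'), follow=True),
        Rule(LinkExtractor(allow=r'gongsi/\d+\.html'), follow=True),
        # parse the job detail pages
        Rule(LinkExtractor(allow=r'jobs/\d+\.html'), callback='parse_job', follow=True),
    )

    def parse_job(self, response):
        # selectors below are hypothetical and would need to match the real page
        yield {
            "title": response.css(".job-name::attr(title)").extract_first(""),
            "url": response.url,
            "publish_time": response.css(".publish_time::text").extract_first(""),
            "job_city": response.css(".job_request span:nth-child(2)::text").extract_first(""),
            "crawl_time": datetime.datetime.now().strftime("%Y/%m/%d %H:%M:%S"),
        }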
5. Crawling vs. anti-crawling strategies
1. Rotating the User-Agent
Every request passes through the downloader middlewares on its way to becoming a response, so a custom middleware can swap in a random User-Agent before the request goes out
(the built-in source at site-packages/scrapy/downloadermiddlewares/useragent.py is a good reference)
# activate the middleware
In settings:
DOWNLOADER_MIDDLEWARES = {
    'scrapy_project.middlewares.MyCustomDownloaderMiddleware': 543,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
Option 1: build your own user_agent_list and pick a random entry with random.randint
A user_agent_list must be defined in settings (see the sketch after the middleware code below)
In middleware.py:

import random


class RandomUserAgentMiddlware(object):
    def __init__(self, crawler):
        super(RandomUserAgentMiddlware, self).__init__()
        self.user_agent_list = crawler.settings.get("user_agent_list", [])

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        random_num = random.randint(0, len(self.user_agent_list) - 1)
        random_agent = self.user_agent_list[random_num]
        request.headers.setdefault('User-Agent', random_agent)
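A sketch of the corresponding settings.py entries (the list contents and the 543 priority are placeholders):

# settings.py
user_agent_list = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0 Safari/537.36",  # placeholder entry
]

DOWNLOADER_MIDDLEWARES = {
    'scrapy_project.middlewares.RandomUserAgentMiddlware': 543,
    # disable the built-in UserAgentMiddleware so it does not overwrite ours
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}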
(Recommended!) Option 2: use the fake-useragent package from GitHub to pick a random User-Agent
pip install fake-useragent
from fake_useragent import UserAgent


class RandomUserAgentMiddlware(object):
    def __init__(self, crawler):
        super(RandomUserAgentMiddlware, self).__init__()
        self.ua = UserAgent()
        self.ua_type = crawler.settings.get('RANDOM_UA_TYPE', "random")

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        def get_ua():
            # e.g. self.ua.random / self.ua.chrome / self.ua.ie, depending on RANDOM_UA_TYPE
            return getattr(self.ua, self.ua_type)

        request.headers.setdefault('User-Agent', get_ua())
Note: super(RandomUserAgentMiddlware, self).__init__() lets the subclass run the parent class's initialisation and inherit its attributes; the matching settings entries are sketched below
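A sketch of the settings needed for this middleware (the values and priorities are placeholders):

# settings.py
RANDOM_UA_TYPE = "random"   # or "chrome", "firefox", "ie" ... attributes exposed by fake-useragent

DOWNLOADER_MIDDLEWARES = {
    'scrapy_project.middlewares.RandomUserAgentMiddlware': 543,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}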
2. Using proxy IPs
Using a single proxy IP
Use Xici's free high-anonymity proxies to hide the host IP
Fill in the proxy IP and port:
request.meta["proxy"] = "https://183.71.136.98:8118"
Using a proxy IP pool
Write your own crawler, crawl_xici_ip.py, that scrapes the free high-anonymity proxies from the Xici proxy site
import datetime

import requests
from scrapy.selector import Selector
import MySQLdb

conn = MySQLdb.connect(host="127.0.0.1", user="root", passwd="wuting123", db="scrapy_lagou", charset="utf8")
cursor = conn.cursor()


def crawl_ips():
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393"}
    for i in range(1, 5):
        re = requests.get("http://www.xicidaili.com/nn/{0}".format(i), headers=headers)
        selector = Selector(text=re.text)
        all_trs = selector.css("#ip_list tr")
        ip_list = []
        for tr in all_trs[1:]:
            speed_str = tr.css(".bar::attr(title)").extract()[0]
            if speed_str:
                speed = float(speed_str.split("秒")[0])
            text = tr.css("td::text").extract()
            all_text = list()
            for i in text:
                if '\n' not in i:
                    all_text.append(i)
            ip = all_text[0]
            port = all_text[1]
            proxy_type = all_text[3]
            validate = "20" + all_text[5]
            validate_time = datetime.datetime.strptime(validate, "%Y-%m-%d %H:%M")
            ip_list.append((ip, port, proxy_type, speed, validate_time))
        for ip_add in ip_list:
            cursor.execute(
                "insert proxy_ip(ip, port, proxy_type, speed, validate_time) VALUES('{0}','{1}','{2}','{3}','{4}')".format(
                    ip_add[0], ip_add[1], ip_add[2], ip_add[3], ip_add[4]
                )
            )
            conn.commit()


# fetch an IP from the database
class GetIP(object):
    # delete IPs that fail the check
    def delete_ip(self, ip):
        delect_sql = """
            DELETE FROM proxy_ip WHERE ip = '{0}'
        """.format(ip)
        cursor.execute(delect_sql)
        conn.commit()

    # check whether an IP is still usable
    def judge_ip(self, ip, port, proxy_type):
        http_url = "http://www.baidu.com"
        proxy_url = "{0}://{1}:{2}".format(proxy_type, ip, port)
        try:
            proxy_dirt = {
                "http": proxy_url,
                "https": proxy_url
            }
            response = requests.get(http_url, proxies=proxy_dirt)
        except Exception as e:
            print("invalid ip and port")
            print(proxy_url)
            self.delete_ip(ip)
            return False
        else:
            code = response.status_code
            if code >= 200 and code < 300:
                print(proxy_url)
                print("effective ip")
                return True
            else:
                print("invalid ip and port")
                self.delete_ip(ip)
                return False

    # pick a random row from the database with SQL
    def get_random_ip(self):
        random_sql = '''SELECT ip, port, proxy_type FROM proxy_ip
            ORDER BY RAND()
            LIMIT 1
        '''
        result = cursor.execute(random_sql)
        for ip_add in cursor.fetchall():
            ip = ip_add[0]
            port = ip_add[1]
            proxy_type = ip_add[2]
            judge_ip = self.judge_ip(ip, port, proxy_type)
            if judge_ip:
                return "{0}://{1}:{2}".format(proxy_type, ip, port)
            else:
                return self.get_random_ip()


if __name__ == "__main__":
    get_ip = GetIP()
    get_ip.get_random_ip()
Hook the Xici proxy IPs up in settings and the middleware
DOWNLOADER_MIDDLEWARES = {
    'scrapy_project.middlewares.RandomUserAgentMiddlware': 10,
    'scrapy_project.middlewares.RandomProxyMiddleware': 11,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
middleware
# GetIP comes from crawl_xici_ip.py above; the import path below is hypothetical and
# must be adjusted to wherever that file lives in your project
from tools.crawl_xici_ip import GetIP


class RandomProxyMiddleware(object):
    def process_request(self, request, spider):
        get_ip = GetIP()
        request.meta["proxy"] = get_ip.get_random_ip()
Use the open-source scrapy-proxies library from GitHub
More powerful and complete than a hand-rolled version
Use scrapy-crawlera, the open-source client for the official paid Crawlera service
Use the Tor (onion routing) network
Your IP is hidden by relaying traffic through multiple hops (but a VPN is needed to reach it)
3. Captcha recognition
1. Google's open-source tool tesseract-ocr
Drawback: heavily affected by noise in the image
2. Online captcha-solving services
e.g. 云打码
3. Human captcha-solving services
e.g. 超速打码
4. Rate limiting
http://scrapy-chs.readthedocs.io/zh_CN/0.24/topics/autothrottle.html
1. Set a minimum download delay in settings:
DOWNLOAD_DELAY
2. Set the concurrency in settings:
CONCURRENT_REQUESTS_PER_DOMAIN or CONCURRENT_REQUESTS_PER_IP
3. Enable AutoThrottle in settings (a settings sketch follows below)
AUTOTHROTTLE_ENABLED — enable AutoThrottle
AUTOTHROTTLE_START_DELAY — initial delay (seconds)
AUTOTHROTTLE_MAX_DELAY — maximum delay (seconds)
AUTOTHROTTLE_DEBUG — enable AutoThrottle debug output
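A sketch of what these settings might look like in settings.py (the numeric values are placeholders, not recommendations):

# settings.py
DOWNLOAD_DELAY = 1                     # minimum delay between requests, in seconds

CONCURRENT_REQUESTS_PER_DOMAIN = 8     # or CONCURRENT_REQUESTS_PER_IP = 8

AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5           # initial delay (seconds)
AUTOTHROTTLE_MAX_DELAY = 60            # maximum delay under heavy latency (seconds)
AUTOTHROTTLE_DEBUG = True              # log throttling stats for every response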
5. Using different settings in different situations
1. By default no login state is needed, so cookies can stay off: set COOKIES_ENABLED = False in settings
2. For sites such as Zhihu that do need cookies, enable them per spider directly in zhihu.py:
custom_settings = {
    "COOKIES_ENABLED": True,
}
6. Using selenium for dynamic websites
1. Getting to know selenium
1. Install selenium
In the virtualenv:
pip install selenium
Search for the selenium Python API, find the official docs, and look up the drivers for the different browsers
(the Firefox driver was tried many times without success; it needs a matching version)
(the Chrome driver was used instead, from http://npm.taobao.org/mirrors/chromedriver/)
(environment: Python 3.6.1, selenium 3.11, Chrome 65.0, chromedriver_win32.zip 2.37)
2. Use selenium's webdriver module to load pages automatically
from selenium import webdriver

browser = webdriver.Chrome(executable_path="D:\python_project\selenium_drviers\chromedriver.exe")
browser.get("https://item.taobao.com/item.htm?spm=a230r.1.14.71.31207d9buGsqEv&id=561063544221&ns=1&abbucket=8#detail")
print(browser.page_source)
3. Parsing fields out of the selenium page source
1. Use scrapy's own Selector for the parsing
from scrapy.selector import Selector

t_selector = Selector(text=browser.page_source)
t_selector.xpath( )
t_selector.css( )
2. Use selenium's own element lookups
browser.find_element_by_css_selector(...) / browser.find_element_by_xpath(...)
2. Loading pages with selenium's webdriver.Chrome()
1. Simulating a Zhihu login with selenium
from selenium import webdriver
from scrapy.selector import Selector

browser = webdriver.Chrome(executable_path="D:\python_project\selenium_drviers\chromedriver.exe")
browser.get("https://www.zhihu.com/signin")
browser.find_element_by_css_selector(".Login-content input[name = 'username']").send_keys("17512009387")
browser.find_element_by_css_selector(".Login-content input[name = 'password']").send_keys("wuting123")
browser.find_element_by_css_selector("button.SignFlow-submitButton").click()
2. Crawling Weibo with selenium
1. Use the Weibo open platform: http://open.weibo.com/wiki/%E9%A6%96%E9%A1%B5
2. Problem encountered: elements could not be located
1. Because of iframes and page redirects, elements may not be locatable straight away
time.sleep(5) can be used to wait 5 seconds for the data to load
3. Logging in to Weibo
(captchas can be entered by hand when they appear)
def weibo():
    url = "https://weibo.com/"
    browser.get("https://weibo.com/")
    time.sleep(5)
    browser.find_element_by_css_selector("div.WB_miniblog div[node-type='username_box'] input[name = 'username']").send_keys("15870635250")
    browser.find_element_by_css_selector("div.WB_miniblog div[node-type='password_box'] input[name = 'password']").send_keys("tumeihong")
    browser.find_element_by_css_selector("div.WB_miniblog div[node-type='normal_form'] div.info_list.login_btn a[node-type = 'submitBtn']").click()
4. Simulating scroll-to-refresh (for JavaScript-loaded content)
Using a snippet of JavaScript:

for i in range(3):
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight); var lenOfPage=document.body.scrollHeight; return lenOfPage")
    time.sleep(3)
window.scrollTo(): scrolls the window to a given position
document.body.scrollHeight: the maximum coordinate the body can scroll to
3. Configuring chromedriver via selenium so that it does not load images

# do not load images
chrom_opt = webdriver.ChromeOptions()
prefs = {"profile.managed_default_content_settings.images": 2}
chrom_opt.add_experimental_option("prefs", prefs)
browser = webdriver.Chrome(executable_path="D:\python_project\selenium_drviers\chromedriver.exe", chrome_options=chrom_opt)
browser.get("http://www.taobao.com")
chrom_opt: an instance of ChromeOptions
prefs: sets chromedriver's image preference to 2, which means images are not shown
chrom_opt.add_experimental_option() loads the external preferences into the ChromeOptions object
chrome_options= passes those settings into the browser instance
Note: webdriver.Chrome() should not be created inside the function, or the browser window closes straight away; multiple browser instances can be managed with try ... except
3. Using a headless browser: phantomjs
(phantomjs performance degrades badly under multi-processing)
1. Install phantomjs
1. Download it from http://phantomjs.org/
and unzip it into D:/python_scrapy/phantomjs
2. Configure phantomjs

# executable_path must point at the phantomjs binary (adjust to where phantomjs.exe was unzipped above)
browser = webdriver.PhantomJS(executable_path="D:/python_scrapy/phantomjs/bin/phantomjs.exe")
browser.get("http://www.baidu.com")
print(browser.page_source)
browser.quit()
Runs without showing a UI, fully automated
4. Integrating selenium into scrapy
By writing a downloader middleware (a sketch follows below)
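The note above only names the approach, so here is a minimal sketch of such a middleware, reusing a chromedriver path like the one used earlier (not the author's actual implementation); returning an HtmlResponse from process_request makes scrapy skip its own download step for that request:

# middlewares.py — a minimal sketch
from selenium import webdriver
from scrapy.http import HtmlResponse


class SeleniumMiddleware(object):
    def __init__(self):
        # reuse one browser for all requests instead of opening one per request
        self.browser = webdriver.Chrome(
            executable_path="D:\python_project\selenium_drviers\chromedriver.exe")

    def process_request(self, request, spider):
        self.browser.get(request.url)
        # returning a Response here short-circuits scrapy's downloader
        return HtmlResponse(url=self.browser.current_url,
                            body=self.browser.page_source,
                            encoding="utf-8",
                            request=request)

It would be activated in DOWNLOADER_MIDDLEWARES like the other middlewares above.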