接着上篇博客继续往下走。上篇博客地址:
一、更新代码
vim ITtest.py
import os
import urllib
import urllib.request  # required: `import urllib` alone does not expose urllib.request in Python 3

import scrapy
from scrapy.http.response.html import HtmlResponse  # HtmlResponse type (kept for reference/typing)
from scrapy.selector.unified import SelectorList  # SelectorList type (kept for reference/typing)

from qiushi.items import QiushiItem  # QiushiItem defined in the qiushi project's items.py
class IttestSpider(scrapy.Spider):
    """Crawl the text section of qiushibaike.com.

    For each joke on a page, yield a ``QiushiItem`` (avatar URL, author,
    content) and download the author's avatar into ``../img/``.  Follows
    the "next page" pagination link until it runs out.
    """

    name = 'ITtest'
    allowed_domains = ['www.qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/text/page/1/']
    bash_domain = "https://www.qiushibaike.com"

    def parse(self, response):
        """Parse one listing page: yield items, download avatars, follow pagination."""
        # Each joke lives in its own <div> under the old-style column container.
        body = response.xpath('//div[@class="col1 old-style-col1"]/div')

        # Ensure the image directory exists ONCE per page (the original
        # re-checked it for every item).  makedirs(exist_ok=True) is
        # race-free, unlike exists() + mkdir().
        path_dir = os.path.join(os.path.dirname(os.getcwd()), 'img')
        os.makedirs(path_dir, exist_ok=True)

        for duanzhi in body:
            # Avatar URL is scheme-relative (starts with //), hence the
            # 'http:' prefix added at download time below.
            touxiang = duanzhi.xpath('.//div//@src').get()
            neirong = duanzhi.xpath('.//div[@class="content"]//text()').getall()
            neirong = "".join(neirong).strip()
            # Guard against a missing <h2>: .get() may return None, and the
            # original's .get().strip() would raise AttributeError.
            zuozhe = duanzhi.xpath('.//div//h2/text()').get()
            if zuozhe:
                zuozhe = zuozhe.strip()
            item = QiushiItem(头像=touxiang, 作者=zuozhe, 内容=neirong)

            if zuozhe and touxiang:
                print(zuozhe, touxiang)
                file_path = os.path.join(path_dir, zuozhe + '.jpg')
                if not os.path.exists(file_path):
                    print(file_path)
                    # urlretrieve downloads the remote image straight to a
                    # local file and creates that file itself — the original
                    # os.mknod() call was redundant and can fail without
                    # privileges on some systems.
                    urllib.request.urlretrieve('http:' + touxiang, file_path)
            yield item

        # Follow the "next page" link until there is none.
        next_url = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").get()
        if next_url:
            yield scrapy.Request(self.bash_domain + next_url, callback=self.parse)
二、再次爬虫
scrapy crawl ITtest
三、查看爬取数据
四、打包压缩传输到windows机器中
zip -r img.zip img/
查看img文件
本文地址:https://blog.csdn.net/qq_37377136/article/details/107239874