scrapy crawl 爬取微信小程序文章

import scrapy

from scrapy.linkextractors import LinkExtractor

from scrapy.spiders import CrawlSpider, Rule

from wxapp.items import WxappItem

class WxSpider(CrawlSpider):
    """Crawl the wxapp-union.com article listing and scrape each article page.

    Listing pages are only followed to discover further links; article
    pages are handed to :meth:`parse_detail`, which yields one
    ``WxappItem`` per page.
    """

    name = 'wx'
    allowed_domains = ['wxapp-union.com']
    start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1']

    rules = (
        # Pagination links of the listing: follow them, no callback.
        Rule(
            LinkExtractor(allow=r'.*mod=list&catid=2&page=\d+'),
            follow=True,
        ),
        # Article detail pages: parse them, do not follow links found there.
        Rule(
            LinkExtractor(allow=r'.*article-.+\.html'),
            callback='parse_detail',
            follow=False,
        ),
    )

    def parse_detail(self, response):
        """Extract the fields of a single article page.

        :param response: the downloaded article page.
        :return: generator yielding one ``WxappItem``.
        """
        # Every text node under the article body, trimmed individually and
        # then glued together without a separator.
        text_nodes = response.xpath('//td[@id="article_content"]//text()').getall()
        article_text = ''.join(node.strip() for node in text_nodes).strip()

        # Keyword order is kept as-is: it determines the order in which the
        # item's values are stored.
        yield WxappItem(
            title=response.xpath('//h1[@class="ph"]/text()').get(),
            content=article_text,
            detail_href=response.request.url,
            pub_time=response.xpath('//p[@class="authors"]/span/text()').get(),
            author=response.xpath('//p[@class="authors"]/a/text()').get(),
        )

from scrapy.exporters import JsonLinesItemExporter, JsonItemExporter

class WxappPipeline(object):
    """Item pipeline that stores every scraped item in ``data.json``.

    Items are written through a ``JsonItemExporter``, so the file holds a
    single JSON array once the spider has been closed.
    """

    def __init__(self):
        """Open the output file and set up the exporter.

        Runs when the pipeline object is constructed.
        """
        # Opened in binary mode; the exporter is given the target encoding
        # and writes encoded data to the file object.
        self.fp = open("data.json", 'wb')
        self.exporter = JsonItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
        # Write the opening "[" of the JSON array. Without this call (and
        # the matching finish_exporting) the file is not valid JSON.
        self.exporter.start_exporting()

    def open_spider(self, spider):
        """Hook called when the spider is opened; nothing to do here.

        :param spider: the spider that was opened.
        :return: None
        """
        pass

    def process_item(self, item, spider):
        """Append ``item`` to the JSON output and pass it on unchanged.

        :param item: the scraped item.
        :param spider: the spider that produced the item.
        :return: the same ``item``.
        """
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        """Finish the JSON array and close the output file.

        :param spider: the spider that was closed.
        :return: None
        """
        try:
            # Write the closing "]" of the JSON array.
            self.exporter.finish_exporting()
        finally:
            # Release the file handle even if finalising the export fails.
            self.fp.close()

import scrapy

class WxappItem(scrapy.Item):
    """Container for the data scraped from one article page."""

    # Article headline.
    title = scrapy.Field()
    # Article body as plain text.
    content = scrapy.Field()
    # Publication time text as shown on the page.
    pub_time = scrapy.Field()
    # Author name.
    author = scrapy.Field()
    # URL of the article page the item was scraped from.
    detail_href = scrapy.Field()

scrapy crawl 爬取微信小程序文章的相关教程结束。

《scarpy crawl 爬取微信小程序文章.doc》

下载本文的Word格式文档，以方便收藏与打印。

scrapy crawl 爬取微信小程序文章

scrapy crawl 爬取微信小程序文章的相关教程结束。

相关推荐

valgrind 配合 gdb 调试程序

微信小程序来了，小程序都能做些什么

Linux vim程序编辑器

转：CURL库在程序中的运用浅析

VS2022使用ClickOnce发布程序本地安装.net框架

DOS系统中EXE程序加载过程

Spring MVC-处理程序映射（Handler Mapping）-控制器类名称处理程序映射（Controller Class Name Handler Mapping）示例（转载实践）

.Net Core 控制台程序错误：Can not find runtime target for framework '.NETCoreApp,Version=v1.0' compatible with one of the target runtimes: 'win10-x64, win81-x64, win8-x64, win7-x64'.