(p3scrapy) [vagrant@reboot vagrant]$ scrapy startproject ArticleSpider
You can start your first spider with:
    cd ArticleSpider
    scrapy genspider example example.com
(p3scrapy) [vagrant@reboot ArticleSpider]$ scrapy genspider jobbole blog.jobbole.com
(p3scrapy) [vagrant@reboot ArticleSpider]$ tree .
.
├── ArticleSpider
│   ├── images
│   │   └── full
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   ├── spiders
│   │   ├── __init__.py
│   │   ├── jobbole.py
├── main.py
└── scrapy.cfg
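genspider only produces a skeleton for spiders/jobbole.py; roughly (the exact template varies a little between Scrapy versions) it starts out like this, before we fill in the real crawling logic:

# spiders/jobbole.py right after `scrapy genspider jobbole blog.jobbole.com`
# (approximate template; Scrapy versions differ slightly)
import scrapy


class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/']

    def parse(self, response):
        pass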
CREATE TABLE `article` (
  `title` varchar(200) NOT NULL,
  `create_date` date DEFAULT NULL,
  `url` varchar(300) NOT NULL,
  `url_object_id` varchar(50) NOT NULL,
  `front_image_url` varchar(300) DEFAULT NULL,
  `front_image_path` varchar(200) DEFAULT NULL,
  `praise_nums` int(11) DEFAULT NULL,
  `fav_nums` int(11) DEFAULT NULL,
  `comment_nums` int(11) DEFAULT NULL,
  `tags` varchar(200) DEFAULT NULL,
  `content` longtext NOT NULL,
  PRIMARY KEY (`url_object_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
# -*- coding: utf-8 -*-
import re
from datetime import datetime
from urllib import parse

import scrapy
from scrapy.http import Request
from scrapy.loader import ItemLoader

from ArticleSpider.items import JobBoleArticleItem, ArticleItemLoader
from ArticleSpider.utils.common import get_md5


class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        """
        1. Extract the article URLs on the list page and hand them to Scrapy to download and parse.
        2. Extract the next-page URL and hand it to Scrapy; once downloaded it is parsed by parse again.
        :param response:
        :return:
        """
        post_nodes = response.css("#archive .floated-thumb .post-thumb a")
        for post_node in post_nodes:
            image_url = post_node.css("img::attr(src)").extract_first("")
            post_url = post_node.css("::attr(href)").extract_first("")
            yield Request(url=parse.urljoin(response.url, post_url),
                          meta={"front_image_url": image_url},
                          callback=self.parse_detail)

        # Extract the next page and schedule it for download
        next_urls = response.css(".next.page-numbers::attr(href)").extract_first("")
        if next_urls:
            yield Request(url=parse.urljoin(response.url, next_urls), callback=self.parse)

    def parse_detail(self, response):
        # Manual field extraction, kept for reference:
        # article_item = JobBoleArticleItem()
        # re_selector = response.xpath("/html/body/div[3]/div[3]/div[1]/div[1]/h1")
        # re2_selector = response.xpath('//*[@id="post-110287"]/div[1]/h1')
        # front_image_url = response.meta.get("front_image_url", "")  # article cover image
        # title = response.xpath('//*[@class="entry-header"]/h1/text()').extract_first("")
        # create_date = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/text()').extract_first("").strip().replace('·', '').strip()
        # praise_nums = int(response.xpath('//span[contains(@class,"vote-post-up")]/h10/text()').extract_first(""))
        # fav_nums = response.xpath('//span[contains(@class,"bookmark-btn")]/text()').extract()[0]
        # match_re = re.match(".*?(\d+).*", fav_nums)
        # if match_re:
        #     fav_nums = int(match_re.group(1))
        # else:
        #     fav_nums = 0
        # comment_nums = response.css('a[href="#article-comment"] span::text').extract_first("")
        # comment_re = re.match(".*?(\d+).*", comment_nums)
        # comment_nums = int(comment_re.group(1)) if comment_re else 0
        # content = response.xpath('//div[@class="entry"]').extract()[0]
        # tag_list = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/a/text()').extract()
        # tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
        # tags = ",".join(tag_list)
        # article_item['url_object_id'] = get_md5(response.url)
        # article_item['title'] = title
        # article_item['url'] = response.url
        # try:
        #     create_date = datetime.strptime(create_date, "%Y/%m/%d").date()
        # except Exception as e:
        #     print(e.args)
        #     create_date = datetime.now().date()
        # article_item['create_date'] = create_date
        # article_item['front_image_url'] = [front_image_url]
        # article_item['praise_nums'] = praise_nums
        # article_item['comment_nums'] = comment_nums
        # article_item['fav_nums'] = fav_nums
        # article_item['tags'] = tags
        # article_item['content'] = content

        # Load the item through an ItemLoader instead
        front_image_url = response.meta.get("front_image_url", "")  # article cover image
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")

        article_item = item_loader.load_item()
        yield article_item
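parse_detail relies on get_md5 from ArticleSpider.utils.common, a small helper that is not shown in this post. A minimal sketch of such a helper (my assumption, not the original file): it hashes the URL so the result always fits the url_object_id varchar(50) primary key.

# ArticleSpider/utils/common.py -- minimal sketch; the original helper is not shown in this post
import hashlib


def get_md5(url):
    # An MD5 hex digest is 32 characters, well within the url_object_id varchar(50) column
    if isinstance(url, str):
        url = url.encode("utf-8")
    return hashlib.md5(url).hexdigest()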
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import datetime
import re

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join


class ArticlespiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class ArticleItemLoader(ItemLoader):
    # Custom ItemLoader: every field keeps only the first extracted value by default
    default_output_processor = TakeFirst()


def date_convert(value):
    try:
        create_date = datetime.datetime.strptime(value, "%Y/%m/%d").date()
    except Exception as e:
        create_date = datetime.datetime.now().date()
    return create_date


def get_nums(value):
    match_re = re.match(".*?(\d+).*", value)
    if match_re:
        nums = int(match_re.group(1))
    else:
        nums = 0
    return nums


def remove_comment_tags(value):
    # Drop the "评论" (comments count) entry that the tag selector also picks up
    if "评论" in value:
        return ""
    else:
        return value


def return_value(value):
    return value


class JobBoleArticleItem(scrapy.Item):
    title = scrapy.Field()
    create_date = scrapy.Field(
        # input_processor=MapCompose(date_convert),
    )
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    front_image_url = scrapy.Field(
        # keep the value as a list so the images pipeline receives a list of URLs
        output_processor=MapCompose(return_value)
    )
    front_image_path = scrapy.Field()
    praise_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    comment_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    fav_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    tags = scrapy.Field(
        input_processor=MapCompose(remove_comment_tags),
        output_processor=Join(",")
    )
    content = scrapy.Field()

    def get_insert_sql(self):
        # Smaller example from an earlier iteration:
        # insert_sql = """
        #     insert into article(title, url, create_date, fav_nums)
        #     VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE content=VALUES(fav_nums)
        # """
        # params = (self["title"], self["url"], self["create_date"], self["fav_nums"])
        insert_sql = """
            insert into article(title, url, create_date, fav_nums, url_object_id,
                praise_nums, comment_nums, tags, content, front_image_url, front_image_path)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        params = (self['title'], self['url'], self['create_date'], self['fav_nums'],
                  self['url_object_id'], self['praise_nums'], self['comment_nums'],
                  self['tags'], self['content'], self['front_image_url'], self['front_image_path'])
        return insert_sql, params
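Because url_object_id is the table's primary key, re-crawling an article that is already stored makes the plain INSERT above fail with a duplicate-key error. The commented-out snippet in get_insert_sql already hints at MySQL's ON DUPLICATE KEY UPDATE; a hedged drop-in variant of get_insert_sql along those lines (my sketch, not part of the original code) could look like this:

    def get_insert_sql(self):
        # Sketch: update the volatile counters and the content when the same url_object_id is inserted again
        insert_sql = """
            insert into article(title, url, create_date, fav_nums, url_object_id,
                praise_nums, comment_nums, tags, content, front_image_url, front_image_path)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE fav_nums=VALUES(fav_nums), praise_nums=VALUES(praise_nums),
                comment_nums=VALUES(comment_nums), content=VALUES(content)
        """
        params = (self['title'], self['url'], self['create_date'], self['fav_nums'],
                  self['url_object_id'], self['praise_nums'], self['comment_nums'],
                  self['tags'], self['content'], self['front_image_url'], self['front_image_path'])
        return insert_sql, params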
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import codecs
import json

import MySQLdb
import MySQLdb.cursors
from scrapy.exporters import JsonItemExporter
from scrapy.pipelines.images import ImagesPipeline
from twisted.enterprise import adbapi


class ArticlespiderPipeline(object):
    def process_item(self, item, spider):
        return item


class JsonWithEncodingPipeline(object):
    # Custom JSON export: write each item to article.json as one JSON line
    def __init__(self):
        self.file = codecs.open('article.json', 'w', encoding="utf-8")

    def process_item(self, item, spider):
        lines = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(lines)
        return item

    def spider_closed(self, spider):
        self.file.close()


class MysqlPipeline(object):
    # Write to MySQL synchronously
    def __init__(self):
        self.conn = MySQLdb.connect('127.0.0.1', 'root', '123456', 'articlespider',
                                    charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        insert_sql = """
            insert into article(title, url, create_date, fav_nums, url_object_id, front_image_path,
                praise_nums, comment_nums, tags, content, front_image_url)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        self.cursor.execute(insert_sql, (item['title'], item['url'], item['create_date'], item['fav_nums'],
                                         item['url_object_id'], item['front_image_path'], item['praise_nums'],
                                         item['comment_nums'], item['tags'], item['content'],
                                         item['front_image_url']))
        self.conn.commit()
        return item


class MysqlTwistedPipeline(object):
    # Write to MySQL asynchronously through twisted's adbapi connection pool
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            user=settings["MYSQL_USER"],
            password=settings["MYSQL_PASSWORD"],
            db=settings["MYSQL_DBNAME"],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        # Use twisted to turn the MySQL insert into an asynchronous call
        query = self.dbpool.runInteraction(self.do_insert, item)
        # Handle exceptions raised by the asynchronous insert
        query.addErrback(self.handle_error)
        return item

    def handle_error(self, failure):
        # Handle exceptions raised by the asynchronous insert
        print(failure)

    def do_insert(self, cursor, item):
        # Run the actual insert; the SQL and params come from the item itself
        insert_sql, params = item.get_insert_sql()
        cursor.execute(insert_sql, params)


class JsonItemExporterPipeline(object):
    # Export items to a JSON file with scrapy's built-in JsonItemExporter
    def __init__(self):
        self.file = codecs.open('articleport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item


class ArticleImagePipeline(ImagesPipeline):
    # Custom image download pipeline: record the local path of the downloaded cover image
    def item_completed(self, results, item, info):
        if "front_image_url" in item:
            for ok, value in results:
                image_file_path = value["path"]
                item["front_image_path"] = image_file_path
        return item
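One limitation of handle_error above is that it only receives the Twisted failure, not the item that triggered it. Deferred.addErrback forwards extra positional arguments to the errback, so a hedged variant of the two methods (my sketch, not the author's code) that logs the offending URL could be:

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self.do_insert, item)
        # Extra arguments after the errback are passed to it alongside the failure
        query.addErrback(self.handle_error, item, spider)
        return item

    def handle_error(self, failure, item, spider):
        # Log which article failed to insert instead of just printing the failure
        spider.logger.error("MySQL insert failed for %s: %s", item.get("url"), failure)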
Add the following to settings.py:
MYSQL_HOST = "127.0.0.1"
MYSQL_DBNAME = "articlespider"
MYSQL_USER = "root"
MYSQL_PASSWORD = "123456"

ROBOTSTXT_OBEY = False

import sys, os
BASE_DIR = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
sys.path.insert(0, os.path.join(BASE_DIR, "ArticleSpider"))

ITEM_PIPELINES = {
    # 'ArticleSpider.pipelines.ArticlespiderPipeline': 300,
    # "scrapy.pipelines.images.ImagesPipeline": 1,             # scrapy's built-in image download pipeline
    # 'ArticleSpider.pipelines.JsonWithEncodingPipeline': 2,   # custom pipeline that writes a JSON file
    # 'ArticleSpider.pipelines.JsonItemExporterPipeline': 2,   # scrapy's built-in JSON exporter
    # 'ArticleSpider.pipelines.MysqlPipeline': 2,              # synchronous MySQL writes
    'ArticleSpider.pipelines.MysqlTwistedPipeline': 2,         # asynchronous MySQL writes
    'ArticleSpider.pipelines.ArticleImagePipeline': 1,         # custom image pipeline; the lower number runs first, so front_image_path is set before the MySQL insert
}

# Image download
IMAGES_URLS_FIELD = "front_image_url"
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, 'images')
DOWNLOAD_FAIL_ON_DATALOSS = False
Start the crawler with a small main.py at the project root:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from scrapy.cmdline import execute

import sys, os

# Make sure the project root is on sys.path so the ArticleSpider package can be imported
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# print(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

execute(["scrapy", "crawl", "jobbole"])
# execute(["scrapy", "crawl", "lagou"])  # another spider in the same project
Reposted from: http://duksi.baihongyu.com/