
Commit fb9cc07

Added pipelines

1 parent 0a29b99 commit fb9cc07

2 files changed (+31 additions, -9 deletions)

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+from scrapy.contrib.linkextractors import LinkExtractor
+from scrapy.contrib.spiders import CrawlSpider, Rule
+from wikiSpider.items import Article
+
+class ArticleSpider(CrawlSpider):
+    name = 'articlePipelines'
+    allowed_domains = ['wikipedia.org']
+    start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life']
+    rules = [
+        Rule(LinkExtractor(allow='(/wiki/)((?!:).)*$'), callback='parse_items', follow=True),
+    ]
+
+    def parse_items(self, response):
+        article = Article()
+        article['url'] = response.url
+        article['title'] = response.css('h1::text').extract_first()
+        article['text'] = response.xpath('//div[@id="mw-content-text"]//text()').extract()
+        article['lastUpdated'] = response.css('li#footer-info-lastmod::text').extract_first()
+        return article
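A portability note: the scrapy.contrib package used in these imports was deprecated in Scrapy 1.0 and removed in later releases, so the spider as committed only runs on old Scrapy versions. On a current install the same spider would import from the top-level modules instead (a minimal sketch, assuming Scrapy >= 1.0; the rest of the file is unchanged):

    # Scrapy >= 1.0 equivalents of the scrapy.contrib imports above
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule

The LinkExtractor pattern '(/wiki/)((?!:).)*$' follows only /wiki/ links with no colon after the prefix, which skips namespaced pages such as Talk: or File: entries while crawling article links.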
Lines changed: 12 additions & 9 deletions
@@ -1,11 +1,14 @@
-# -*- coding: utf-8 -*-
-
-# Define your item pipelines here
-#
-# Don't forget to add your pipeline to the ITEM_PIPELINES setting
-# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
-
+from datetime import datetime
+from wikiSpider.items import Article
+from string import whitespace
 
 class WikispiderPipeline(object):
-    def process_item(self, item, spider):
-        return item
+    def process_item(self, article, spider):
+        dateStr = article['lastUpdated']
+        # This page was last edited on 26 January 2018, at 03:56.
+        article['lastUpdated'] = article['lastUpdated'].replace('This page was last edited on', '')
+        article['lastUpdated'] = article['lastUpdated'].strip()
+        article['lastUpdated'] = datetime.strptime(article['lastUpdated'], '%d %B %Y, at %H:%M.')
+        article['text'] = [line for line in article['text'] if line not in whitespace]
+        article['text'] = ''.join(article['text'])
+        return article
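The boilerplate comment deleted above still applies: the pipeline only runs once it is registered in ITEM_PIPELINES. A minimal sketch of the corresponding settings.py entry, assuming the default project layout (the priority 300 is an arbitrary value between 0 and 1000):

    # settings.py: route scraped items through the pipeline
    ITEM_PIPELINES = {
        'wikiSpider.pipelines.WikispiderPipeline': 300,
    }

With that in place, running e.g. 'scrapy crawl articlePipelines -o articles.json' sends every Article returned by parse_items through process_item before export.

For reference, the lastUpdated cleanup can be checked in isolation; a standalone sketch using the sample footer string quoted in the diff's comment:

    from datetime import datetime

    raw = 'This page was last edited on 26 January 2018, at 03:56.'
    cleaned = raw.replace('This page was last edited on', '').strip()
    # '%d %B %Y, at %H:%M.' matches strings like '26 January 2018, at 03:56.'
    print(datetime.strptime(cleaned, '%d %B %Y, at %H:%M.'))  # 2018-01-26 03:56:00

One subtlety in the text filter: string.whitespace is the string ' \t\n\r\x0b\x0c', so 'line not in whitespace' is a substring test. It drops empty strings and single whitespace characters, but keeps longer runs such as '\n\n' that happen not to be substrings of that constant.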

0 commit comments
