Commit
* add farsi blogs
* modify crawler README file
* modify #2 crawler README file
* add strip function to hamshahri_spider
Showing 8 changed files with 115 additions and 5 deletions.
@@ -1,10 +1,18 @@
-simple crawler using scrapy framework . scrapy have useful features like : avoiding duplicate urls, limitation on depth , defining request rate and ... .
+simple crawler using scrapy framework . scrapy have useful features like : avoiding duplicate urls, limitation on depth , defining request rate and ... . \n
 you need to install scrapy :
 `pip install scrapy`
-and run the crawler :
-`scrapy crawl hamshahri -o ham.json`
+and run the crawlers : <br />
+
+`scrapy crawl hamshahri -o hamshahri.json` <br />
+`scrapy crawl blog -o blog.json` <br />
+`scrapy crawl blogfa -o blogfa.json` <br />
+`scrapy crawl blogsky -o blogsky.json` <br />
+`scrapy crawl dorsablog -o dorsablog.json` <br />
+`scrapy crawl mihanblog -o mihanblog.json` <br />
+`scrapy crawl persianblog -o persianblog.json` <br />
+
-settings can be found in :'crawler/settings.py' and 'crawler/spiders/hamshahri_spider.py'
+settings can be found in :'crawler/settings.py' <br />
+spiders are available in 'crawler/spiders'
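The README points to 'crawler/settings.py' for the features it lists (duplicate-URL filtering, depth limit, request rate), but that file is not part of this diff. Below is a minimal sketch of what such a settings module could look like, not the repository's actual file; the setting names are standard Scrapy settings and every value is an assumption.

# Hypothetical crawler/settings.py sketch -- not the file from this repository.
BOT_NAME = "crawler"

DEPTH_LIMIT = 3                     # assumed: stop following links deeper than 3 hops
DOWNLOAD_DELAY = 1.0                # assumed: ~1 second between requests (request rate)
CONCURRENT_REQUESTS_PER_DOMAIN = 4  # assumed: limit parallel requests per site
ROBOTSTXT_OBEY = True               # assumed: respect robots.txt

# Duplicate URLs are dropped by Scrapy's default request fingerprint filter;
# this is the default value, spelled out here only for clarity.
DUPEFILTER_CLASS = "scrapy.dupefilters.RFPDupeFilter"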
@@ -0,0 +1,17 @@
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "blog"
    start_urls = [
        'http://blog.ir/topblogs/96'
    ]
    allowed_domains = ["blog.ir"]

    def parse(self, response):
        # emit every paragraph's text as an item, stripped of surrounding whitespace
        for quote in response.css('p::text').extract():
            yield {
                'text': quote.strip()
            }
        # queue every link on the page to be parsed with this same callback
        for href in response.css('a::attr(href)').extract():
            yield response.follow(href, callback=self.parse)
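A spider like the one above does not have to be launched with the `scrapy crawl` CLI. The following is a hedged sketch of running it from a plain Python script, assuming Scrapy 2.1+ for the FEEDS setting; QuotesSpider is the class shown directly above (paste it into the same script or import it from its module in crawler/spiders).

# Sketch only -- not part of this commit.
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    # write scraped items to blog.json, like `scrapy crawl blog -o blog.json`
    "FEEDS": {"blog.json": {"format": "json"}},
    "DEPTH_LIMIT": 2,       # assumed value: stop following links after two hops
    "DOWNLOAD_DELAY": 0.5,  # assumed value: pause between requests
})
process.crawl(QuotesSpider)  # the "blog" spider class defined above
process.start()              # blocks until the crawl finishes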
@@ -0,0 +1,17 @@
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "blogfa"
    start_urls = [
        'https://blogfa.com/members/'
    ]
    allowed_domains = ["blogfa.com"]

    def parse(self, response):
        for quote in response.css('p::text').extract():
            yield {
                'text': quote.strip()
            }
        for href in response.css('a::attr(href)').extract():
            yield response.follow(href, callback=self.parse)
@@ -0,0 +1,17 @@
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "blogsky"
    start_urls = [
        'http://www.blogsky.com/posts'
    ]
    allowed_domains = ["blogsky.com"]

    def parse(self, response):
        for quote in response.css('p::text').extract():
            yield {
                'text': quote.strip()
            }
        for href in response.css('a::attr(href)').extract():
            yield response.follow(href, callback=self.parse)
@@ -0,0 +1,17 @@
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "dorsablog"
    start_urls = [
        'https://dorsablog.com/update'
    ]
    allowed_domains = ["dorsablog.com"]

    def parse(self, response):
        for quote in response.css('p::text').extract():
            yield {
                'text': quote.strip()
            }
        for href in response.css('a::attr(href)').extract():
            yield response.follow(href, callback=self.parse)
@@ -0,0 +1,17 @@
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "mihanblog"
    start_urls = [
        'http://mihanblog.com/'
    ]
    allowed_domains = ["mihanblog.com"]

    def parse(self, response):
        for quote in response.css('p::text').extract():
            yield {
                'text': quote.strip()
            }
        for href in response.css('a::attr(href)').extract():
            yield response.follow(href, callback=self.parse)
@@ -0,0 +1,17 @@
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "persianblog"
    start_urls = [
        'https://persianblog.ir/'
    ]
    allowed_domains = ["persianblog.ir"]

    def parse(self, response):
        for quote in response.css('p::text').extract():
            yield {
                'text': quote.strip()
            }
        for href in response.css('a::attr(href)').extract():
            yield response.follow(href, callback=self.parse)
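All of these spiders export items of the form {'text': ...}, so each feed written with `-o name.json` is a plain JSON array. A small usage sketch, assuming a finished crawl produced persianblog.json as shown in the README:

# Sketch only -- not part of this commit.
import json

with open("persianblog.json", encoding="utf-8") as f:  # assumed output path
    items = json.load(f)

# keep the non-empty Farsi paragraphs for downstream processing
texts = [item["text"] for item in items if item["text"]]
print(len(texts), "paragraphs collected")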