Back

Explore Courses Blog Tutorials Interview Questions
0 votes
2 views
in Data Science by (18.4k points)

I am trying to create a spider to crawl multiple pages by using a date range.

The below code works fine but I am hoping to create a loop so I can just input 2 dates and crawl every date in between the 2 instead of having to enter every date manually. I would like to crawl every date since 2018-01-01. The URL is the exact same except for the date.

class Example(CrawlSpider):
    """Scrapy spider whose only start URL is the page for 2020-06-18."""

    name = 'Example'

    allowed_domains = ['example.com.au']

    # A list of complete URLs; the date is the final path segment.
    start_urls = ['https://www.example.com.au/example2/2020-06-18']

I am trying the below but am getting an error:

raise ValueError('Missing scheme in request url: %s' % self._url)

ValueError: Missing scheme in request url: h

import scrapy

from scrapy.linkextractors import LinkExtractor

from scrapy.spiders import CrawlSpider, Rule

from scrapy.selector import Selector

from datetime import timedelta, date

class example(CrawlSpider):

    name = 'example'

    allowed_domains = ['example.com.au']

    

    def daterange(start_date, end_date):

        for n in range(int((end_date - start_date).days)):

            yield start_date + timedelta(n)

    start_date = date(2020, 6, 26)

    end_date = date(2020, 7, 2)

    start_urls = 'https://www.example.com.au/example2/'

    for single_date in daterange(start_date, end_date):

        print(single_date.strftime(start_urls+"%Y-%m-%d"))

    

    rules = (

     Rule(LinkExtractor(restrict_xpaths="//td[@class='example__example']/a"), 

     callback='parse_item', follow=True),

    )

    def parse_item(self, response):

1 Answer

0 votes
by (36.8k points)

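The error occurs because start_urls is a single string, so Scrapy iterates over it character by character and tries to request "h". Build start_urls as a list of full URLs instead, using timedelta and date.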

from datetime import timedelta, date

def daterange(start_date, end_date):
    """Yield consecutive dates from start_date up to, but excluding, end_date.

    Nothing is yielded when end_date is on or before start_date.
    """
    span_in_days = (end_date - start_date).days
    for offset in range(span_in_days):
        yield start_date + timedelta(days=offset)

# Crawl window: start_date is included, end_date is excluded (see daterange).
start_date = date(2020, 6, 18)
end_date = date(2020, 7, 2)

# Common URL prefix; the pages differ only in the trailing YYYY-MM-DD date.
start_url = 'https://www.example.com.au/example2/'

# Must be a list of complete URLs, one per date. The prefix is concatenated
# with the ISO date rather than passed through strftime, so a '%' in the URL
# can never be misread as a format directive.
# An explicit loop (not a list comprehension) is used so the snippet also
# works when pasted into a class body, where a comprehension's body cannot
# see class-level names such as start_url.
start_urls = []
for single_date in daterange(start_date, end_date):
    start_urls.append(start_url + single_date.isoformat())

If you want to know more about Data Science, check out the following Data Science resource, which will help you understand Data Science from scratch.

Browse Categories

...