python - Scrapy LinkExtractor
I'm new to programming and have tried everything. I managed to scrape a single page, but when I try to crawl the entire site I get:
[scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min)
Here is my spider:
import urlparse

from scrapy.http import Request
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor


class MySpider(CrawlSpider):
    name = "municipio"
    allowed_domains = ["cm-leiria.pt"]
    start_urls = ["http://www.cm-leiria.pt/pages/"]

    rules = (
        Rule(LinkExtractor(allow=("/pages/\d",),
                           restrict_xpaths=('//ul//li//a[@class="deph\d"]',)),
             callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        base_url = 'http://www.cm-leiria.pt/pages/215'
        # collect every /uploads/... href on the page and download the PDFs
        for a in response.xpath('//a[starts-with(@href, "/uploads/") and not(contains(@href, ":"))]/@href'):
            link = a.extract()
            if link.endswith('.pdf'):
                link = urlparse.urljoin(base_url, link)
                yield Request(link, callback=self.save_pdf)

    def save_pdf(self, response):
        # save the PDF under its original file name
        path = response.url.split('/')[-1]
        with open(path, 'wb') as f:
            f.write(response.body)
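Since "Crawled 0 pages" usually means the Rule's LinkExtractor is not matching any links on the start page, one way to narrow it down is to run the same extractor by hand in scrapy shell. This is only a debugging sketch, not a fix: the allow pattern and restrict_xpaths are copied from the spider above, and the import path scrapy.linkextractors is the newer name (on older Scrapy versions it is scrapy.contrib.linkextractors, as in the spider).

    # run:  scrapy shell "http://www.cm-leiria.pt/pages/"
    from scrapy.linkextractors import LinkExtractor  # scrapy.contrib.linkextractors on old Scrapy

    le = LinkExtractor(
        allow=("/pages/\d",),
        # note: XPath compares @class to the literal string "deph\d",
        # it does not treat \d as a regular expression
        restrict_xpaths=('//ul//li//a[@class="deph\d"]',),
    )
    links = le.extract_links(response)  # 'response' is provided by scrapy shell
    print(len(links))                   # 0 here would explain "Crawled 0 pages"
    for link in links[:10]:
        print(link.url)

If extract_links() returns an empty list, the rule never queues any page beyond the start URL, which matches the log output above.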