python.su site forum
Good afternoon.
I'm trying to run a script that is supposed to log in to a site and then scrape material from it.
I've never written this kind of script (especially one with authorization) and have only a vague grasp of the mechanism. I based it on an example from https://doc.scrapy.org/en/latest/topics/logging.html
But something went wrong:
#! coding: utf-8
__author__ = 'iam'
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.item import Item, Field
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import TakeFirst
from scrapy.http import Request, FormRequest
from scrapy.selector import HtmlXPathSelector
from scrapy import log  # needed for log.ERROR in after_login


class ScrapyTestItem(Item):
    title = Field()
    url = Field()


class Test03Loader(XPathItemLoader):
    default_output_processor = TakeFirst()


class ScrapyTestSpider(CrawlSpider):
    name = "cr01"
    allowed_domains = ["ecom.elko.ru"]
    start_urls = ["https://ecom.elko.ru/Account/Login",
                  "https://ecom.elko.ru/Catalog/Category/SCO"]
    rules = (
        Rule(LinkExtractor(allow=('https://ecom.elko.ru/Catalog/Product/',)),
             callback='parse_item', follow=False),
    )

    def parse(self, response):
        return [FormRequest.from_response(
            response,
            formdata={'username': 'tiscom6', 'password': '6307860'},
            callback=self.after_login)]

    def after_login(self, response):
        # Check that the login succeeded before going on.
        if "authentication failed" in response.body:
            self.log("Login failed", level=log.ERROR)
            return

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        l = Test03Loader(ScrapyTestItem(), hxs)
        l.add_xpath('title', "//h1/text()")
        l.add_value('url', response.url)
        return l.load_item()
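While digging around I noticed that the Scrapy docs warn against using parse as a callback in a CrawlSpider, because CrawlSpider implements its own crawling logic in parse(); overriding it, as I do above, apparently disables the rules. Below is a sketch of how I understand the recommended layout, keeping my contrib-era imports; the 'username'/'password' field names are just what I used above, the real form may name them differently:

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.http import Request, FormRequest


class LoginThenCrawlSpider(CrawlSpider):
    # Sketch: log in first, then hand control back to CrawlSpider's rules.
    name = "cr01_login"
    allowed_domains = ["ecom.elko.ru"]
    login_url = "https://ecom.elko.ru/Account/Login"
    start_urls = ["https://ecom.elko.ru/Catalog/Category/SCO"]
    rules = (
        Rule(LinkExtractor(allow=('https://ecom.elko.ru/Catalog/Product/',)),
             callback='parse_item', follow=False),
    )

    def start_requests(self):
        # Fetch the login page first instead of letting CrawlSpider
        # schedule start_urls directly.
        return [Request(self.login_url, callback=self.login)]

    def login(self, response):
        # Field names are assumptions copied from my code above.
        return FormRequest.from_response(
            response,
            formdata={'username': 'tiscom6', 'password': '6307860'},
            callback=self.after_login)

    def after_login(self, response):
        if "authentication failed" in response.body:
            self.log("Login failed")
            return
        # Only now schedule the catalogue pages; CrawlSpider's default
        # parse() applies the rules and routes products to parse_item.
        for url in self.start_urls:
            yield Request(url)

    def parse_item(self, response):
        self.log("Parsing product page %s" % response.url)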
2016-12-17 23:46:21 [scrapy] INFO: Spider opened
2016-12-17 23:46:21 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2016-12-17 23:46:21 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2016-12-17 23:46:21 [scrapy] DEBUG: Redirecting (302) to <GET https://ecom.elko.ru/Account/Login?ReturnUrl=%2fCatalog%2fCategory%2fSCO> from <GET https://ecom.elko.ru/Catalog/Category/SCO>
2016-12-17 23:46:21 [scrapy] DEBUG: Crawled (200) <GET https://ecom.elko.ru/Account/Login> (referer: None)
2016-12-17 23:46:21 [scrapy] DEBUG: Crawled (200) <GET https://ecom.elko.ru/Account/Login?ReturnUrl=%2fCatalog%2fCategory%2fSCO> (referer: None)
2016-12-17 23:46:21 [scrapy] DEBUG: Crawled (200) <POST https://ecom.elko.ru/Account/Login> (referer: https://ecom.elko.ru/Account/Login)
2016-12-17 23:46:22 [scrapy] DEBUG: Crawled (200) <POST https://ecom.elko.ru/Account/Login?ReturnUrl=%2fCatalog%2fCategory%2fSCO> (referer: https://ecom.elko.ru/Account/Login?ReturnUrl=%2fCatalog%2fCategory%2fSCO)
2016-12-17 23:46:22 [scrapy] INFO: Closing spider (finished)
2016-12-17 23:46:22 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 2365,
 'downloader/request_count': 5,
 'downloader/request_method_count/GET': 3,
 'downloader/request_method_count/POST': 2,
 'downloader/response_bytes': 19527,
 'downloader/response_count': 5,
 'downloader/response_status_count/200': 4,
 'downloader/response_status_count/302': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2016, 12, 17, 20, 46, 22, 105000),
 'log_count/DEBUG': 6,
 'log_count/INFO': 7,
 'request_depth_max': 1,
 'response_received_count': 4,
 'scheduler/dequeued': 5,
 'scheduler/dequeued/memory': 5,
 'scheduler/enqueued': 5,
 'scheduler/enqueued/memory': 5,
 'start_time': datetime.datetime(2016, 12, 17, 20, 46, 21, 433000)}
2016-12-17 23:46:22 [scrapy] INFO: Spider closed (finished)
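From the stats it looks like the login POST came back with 200 and the spider closed right away: not a single Catalog/Product page was requested, so the rules never fired. To at least check whether the login itself succeeded, I could dump the response in after_login and open it in a browser (a quick debugging sketch; the file name is arbitrary):

    def after_login(self, response):
        # Debugging aid: save whatever came back after the login POST
        # so it can be inspected in a browser.
        with open('after_login.html', 'wb') as f:
            f.write(response.body)
        self.log("after_login: %s (%d bytes)" % (response.url, len(response.body)))
        if "authentication failed" in response.body:
            self.log("Login failed", level=log.ERROR)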