From 95464f29ba7719e98c585a07281ba1c545edb448 Mon Sep 17 00:00:00 2001
From: lededev
Date: Wed, 20 Apr 2022 13:50:15 +0800
Subject: [PATCH] gcolle.py: automatically keep the session so a repeat call
 needs only one HTTP request
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 WebCrawler/gcolle.py | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/WebCrawler/gcolle.py b/WebCrawler/gcolle.py
index 6c9691b..87d6bda 100644
--- a/WebCrawler/gcolle.py
+++ b/WebCrawler/gcolle.py
@@ -5,21 +5,33 @@
 from WebCrawler.crawler import *
 from ADC_function import *
 from lxml import etree
+
 def main(number):
+    save_cookies = False
+    cookie_filename = 'gcolle.json'
     try:
+        gcolle_cooikes, cookies_filepath = load_cookies(cookie_filename)
+        session = get_html_session(cookies=gcolle_cooikes)
         number = number.upper().replace('GCOLLE-','')
-        session = get_html_session()
         htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + number).text
-        html = etree.HTML(htmlcode)
-        # R18 countinue
-        htmlcode = session.get(html.xpath('//*[@id="main_content"]/table[1]/tbody/tr/td[2]/table/tbody/tr/td/h4/a[2]/@href')[0]).text
         gcolle_crawler = Crawler(htmlcode)
+        r18_continue = gcolle_crawler.getString('//*[@id="main_content"]/table[1]/tbody/tr/td[2]/table/tbody/tr/td/h4/a[2]/@href')
+        if r18_continue and r18_continue.startswith('http'):
+            htmlcode = session.get(r18_continue).text
+            gcolle_crawler = Crawler(htmlcode)
+            save_cookies = True
+            cookies_filepath and len(cookies_filepath) and Path(cookies_filepath).is_file() and Path(cookies_filepath).unlink(missing_ok=True)
         number_html = gcolle_crawler.getString('//td[contains(text(),"商品番号")]/../td[2]/text()')
         if number != number_html:
             raise Exception('[-]gcolle.py: number not match')
+        if save_cookies:
+            cookies_save = Path.home() / f".local/share/mdc/{cookie_filename}"
+            cookies_save.parent.mkdir(parents=True, exist_ok=True)
+            cookies_save.write_text(json.dumps(session.cookies.get_dict(), sort_keys=True, indent=4), encoding='utf-8')
+
         # get extrafanart url
         if len(gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src')) == 0:
             extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/a/img/@src')
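
The change amounts to a load-or-create cookie jar: seed the session from ~/.local/share/mdc/gcolle.json when it exists, and write the jar back only after the R18 age gate has been answered, so the next invocation can skip that extra request. A minimal standalone sketch of that round trip, using plain requests in place of the project's load_cookies/get_html_session helpers (whose exact behaviour is assumed from the diff):

    # Sketch only: plain-requests stand-in for the patch's load_cookies /
    # get_html_session helpers; paths and behaviour are assumptions from the diff.
    import json
    from pathlib import Path

    import requests

    # Same location the patch writes its cookie file to.
    COOKIE_FILE = Path.home() / ".local/share/mdc/gcolle.json"

    def make_session() -> requests.Session:
        """Create a session, seeding it with cookies persisted by an earlier run."""
        session = requests.Session()
        if COOKIE_FILE.is_file():
            saved = json.loads(COOKIE_FILE.read_text(encoding="utf-8"))
            requests.utils.add_dict_to_cookiejar(session.cookies, saved)
        return session

    def save_session(session: requests.Session) -> None:
        """Persist the cookie jar so the R18 confirmation is only needed once."""
        COOKIE_FILE.parent.mkdir(parents=True, exist_ok=True)
        COOKIE_FILE.write_text(
            json.dumps(session.cookies.get_dict(), sort_keys=True, indent=4),
            encoding="utf-8",
        )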