From 0dff1a72c00ae75cdec7c39ebeee5b49e0a520d5 Mon Sep 17 00:00:00 2001 From: lededev Date: Wed, 20 Apr 2022 12:48:38 +0800 Subject: [PATCH 1/3] clean up --- WebCrawler/gcolle.py | 15 +++++++-------- requirements.txt | 1 - 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/WebCrawler/gcolle.py b/WebCrawler/gcolle.py index 7330cdd..9a7dbd3 100644 --- a/WebCrawler/gcolle.py +++ b/WebCrawler/gcolle.py @@ -4,7 +4,6 @@ sys.path.append('../') from WebCrawler.crawler import * from ADC_function import * from lxml import etree -from requests_html import HTMLSession def main(number): config_file = config.getInstance() @@ -12,7 +11,6 @@ def main(number): number = number.upper().replace('GCOLLE-','') session = get_html_session() - htmlcode = get_html_session('https://gcolle.net/product_info.php/products_id/' + number) htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + number).text html = etree.HTML(htmlcode) # R18 countinue @@ -66,9 +64,10 @@ def main(number): return dic if __name__ == '__main__': - main('840724') - main('840386') - main('838671') - main('814179') - main('834255') - main('814179') + from pprint import pprint + pprint(main('840724')) + pprint(main('840386')) + pprint(main('838671')) + pprint(main('814179')) + pprint(main('834255')) + pprint(main('814179')) diff --git a/requirements.txt b/requirements.txt index 333a750..7b63d8e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,3 @@ certifi==2020.12.5 MechanicalSoup==1.1.0 opencc-python-reimplemented face_recognition -requests_html \ No newline at end of file From f7186aa347f01301949a9b0abfa0a912e584417c Mon Sep 17 00:00:00 2001 From: lededev Date: Wed, 20 Apr 2022 13:03:17 +0800 Subject: [PATCH 2/3] gcolle.py:Add try block --- WebCrawler/gcolle.py | 103 ++++++++++++++++++++++--------------------- 1 file changed, 53 insertions(+), 50 deletions(-) diff --git a/WebCrawler/gcolle.py b/WebCrawler/gcolle.py index 9a7dbd3..6c9691b 100644 --- a/WebCrawler/gcolle.py +++ b/WebCrawler/gcolle.py @@ -6,65 +6,68 @@ from ADC_function import * from lxml import etree def main(number): - config_file = config.getInstance() + try: + number = number.upper().replace('GCOLLE-','') + session = get_html_session() - number = number.upper().replace('GCOLLE-','') - session = get_html_session() + htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + number).text + html = etree.HTML(htmlcode) + # R18 countinue + htmlcode = session.get(html.xpath('//*[@id="main_content"]/table[1]/tbody/tr/td[2]/table/tbody/tr/td/h4/a[2]/@href')[0]).text + gcolle_crawler = Crawler(htmlcode) - htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + number).text - html = etree.HTML(htmlcode) - # R18 countinue - htmlcode = session.get(html.xpath('//*[@id="main_content"]/table[1]/tbody/tr/td[2]/table/tbody/tr/td/h4/a[2]/@href')[0]).text - gcolle_crawler = Crawler(htmlcode) + number_html = gcolle_crawler.getString('//td[contains(text(),"商品番号")]/../td[2]/text()') + if number != number_html: + raise Exception('[-]gcolle.py: number not match') - number_html = gcolle_crawler.getString('//td[contains(text(),"商品番号")]/../td[2]/text()') - if number != number_html: - if config_file.debug(): - print('[!]gcolle.py: number not match') - return {'title':''} + # get extrafanart url + if len(gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src')) == 0: + extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/a/img/@src') + else: + extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src') + # Add "https:" in each extrafanart url + for i in range(len(extrafanart)): + extrafanart[i] = 'https:' + extrafanart[i] - # get extrafanart url - if len(gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src')) == 0: - extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/a/img/@src') - else: - extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src') - # Add "https:" in each extrafanart url - for i in range(len(extrafanart)): - extrafanart[i] = 'https:' + extrafanart[i] + dic = { + "title": gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[1]/td/h1/text()'), + "studio": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'), + "year": re.findall('\d{4}',gcolle_crawler.getString('//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime'))[0], + "outline": gcolle_crawler.getOutline('//*[@id="cart_quantity"]/table/tr[3]/td/p/text()'), + "runtime": '', + "director": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'), + "actor": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'), + "release": re.findall('\d{4}-\d{2}-\d{2}',gcolle_crawler.getString('//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime'))[0], + "number": "GCOLLE-" + str(number_html), + "cover": "https:" + gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href'), + "thumb": "https:" + gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href'), + "trailer": '', + "actor_photo":'', + "imagecut": 4, # 该值为4时同时也是有码影片 也用人脸识别裁剪封面 + "tag": gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[4]/td/a/text()'), + "extrafanart":extrafanart, + "label": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'), + "website": 'https://gcolle.net/product_info.php/products_id/' + number, + "source": 'gcolle.py', + "series": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'), + '无码': False, + } + # for k,v in dic.items(): + # if k == 'outline': + # print(k,len(v)) + # else: + # print(k,v) + # print('===============================================================') + except Exception as e: + dic = {'title':''} + if config.getInstance().debug(): + print(e) - dic = { - "title": gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[1]/td/h1/text()'), - "studio": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'), - "year": re.findall('\d{4}',gcolle_crawler.getString('//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime'))[0], - "outline": gcolle_crawler.getOutline('//*[@id="cart_quantity"]/table/tr[3]/td/p/text()'), - "runtime": '', - "director": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'), - "actor": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'), - "release": re.findall('\d{4}-\d{2}-\d{2}',gcolle_crawler.getString('//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime'))[0], - "number": "GCOLLE-" + str(number_html), - "cover": "https:" + gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href'), - "thumb": "https:" + gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href'), - "trailer": '', - "actor_photo":'', - "imagecut": 4, # 该值为4时同时也是有码影片 也用人脸识别裁剪封面 - "tag": gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[4]/td/a/text()'), - "extrafanart":extrafanart, - "label": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'), - "website": 'https://gcolle.net/product_info.php/products_id/' + number, - "source": 'gcolle.py', - "series": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'), - '无码': False, - } - # for k,v in dic.items(): - # if k == 'outline': - # print(k,len(v)) - # else: - # print(k,v) - # print('===============================================================') return dic if __name__ == '__main__': from pprint import pprint + config.getInstance().set_override("debug_mode:switch=1") pprint(main('840724')) pprint(main('840386')) pprint(main('838671')) From 95464f29ba7719e98c585a07281ba1c545edb448 Mon Sep 17 00:00:00 2001 From: lededev Date: Wed, 20 Apr 2022 13:50:15 +0800 Subject: [PATCH 3/3] =?UTF-8?q?gcolle.py:=E8=87=AA=E5=8A=A8=E7=BB=B4?= =?UTF-8?q?=E6=8C=81=E5=AF=B9=E8=AF=9D=EF=BC=8C=E5=86=8D=E6=AC=A1=E8=B0=83?= =?UTF-8?q?=E7=94=A8=E6=97=B6=E5=8F=AA=E9=9C=80=E4=B8=80=E6=AC=A1http?= =?UTF-8?q?=E8=AF=B7=E6=B1=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebCrawler/gcolle.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/WebCrawler/gcolle.py b/WebCrawler/gcolle.py index 6c9691b..87d6bda 100644 --- a/WebCrawler/gcolle.py +++ b/WebCrawler/gcolle.py @@ -5,21 +5,33 @@ from WebCrawler.crawler import * from ADC_function import * from lxml import etree + def main(number): + save_cookies = False + cookie_filename = 'gcolle.json' try: + gcolle_cooikes, cookies_filepath = load_cookies(cookie_filename) + session = get_html_session(cookies=gcolle_cooikes) number = number.upper().replace('GCOLLE-','') - session = get_html_session() htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + number).text - html = etree.HTML(htmlcode) - # R18 countinue - htmlcode = session.get(html.xpath('//*[@id="main_content"]/table[1]/tbody/tr/td[2]/table/tbody/tr/td/h4/a[2]/@href')[0]).text gcolle_crawler = Crawler(htmlcode) + r18_continue = gcolle_crawler.getString('//*[@id="main_content"]/table[1]/tbody/tr/td[2]/table/tbody/tr/td/h4/a[2]/@href') + if r18_continue and r18_continue.startswith('http'): + htmlcode = session.get(r18_continue).text + gcolle_crawler = Crawler(htmlcode) + save_cookies = True + cookies_filepath and len(cookies_filepath) and Path(cookies_filepath).is_file() and Path(cookies_filepath).unlink(missing_ok=True) number_html = gcolle_crawler.getString('//td[contains(text(),"商品番号")]/../td[2]/text()') if number != number_html: raise Exception('[-]gcolle.py: number not match') + if save_cookies: + cookies_save = Path.home() / f".local/share/mdc/{cookie_filename}" + cookies_save.parent.mkdir(parents=True, exist_ok=True) + cookies_save.write_text(json.dumps(session.cookies.get_dict(), sort_keys=True, indent=4), encoding='utf-8') + # get extrafanart url if len(gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src')) == 0: extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/a/img/@src')