From f7186aa347f01301949a9b0abfa0a912e584417c Mon Sep 17 00:00:00 2001
From: lededev
Date: Wed, 20 Apr 2022 13:03:17 +0800
Subject: [PATCH] gcolle.py:Add try block

---
Reviewer notes (ignored by git am; not part of the commit message):
  * main() now runs the whole scrape inside one try block. Any exception
    raised there (including the explicit "number not match" error and the
    [0] index lookups) makes main() return {'title': ''}.
  * The exception is printed only when config.getInstance().debug() is
    true; otherwise the failure is silent.
  * The __main__ self-test turns debug on through
    set_override("debug_mode:switch=1") so those errors are visible.
  * NOTE(review): line breaks and indentation in this file were
    reconstructed from a whitespace-collapsed copy. The hunk line counts
    match the header, but run `git apply --check` before applying.

 WebCrawler/gcolle.py | 103 ++++++++++++++++++++++---------------------
 1 file changed, 53 insertions(+), 50 deletions(-)

diff --git a/WebCrawler/gcolle.py b/WebCrawler/gcolle.py
index 9a7dbd3..6c9691b 100644
--- a/WebCrawler/gcolle.py
+++ b/WebCrawler/gcolle.py
@@ -6,65 +6,68 @@ from ADC_function import *
 from lxml import etree
 
 def main(number):
-    config_file = config.getInstance()
+    try:
+        number = number.upper().replace('GCOLLE-','')
+        session = get_html_session()
 
-    number = number.upper().replace('GCOLLE-','')
-    session = get_html_session()
+        htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + number).text
+        html = etree.HTML(htmlcode)
+        # R18 countinue
+        htmlcode = session.get(html.xpath('//*[@id="main_content"]/table[1]/tbody/tr/td[2]/table/tbody/tr/td/h4/a[2]/@href')[0]).text
+        gcolle_crawler = Crawler(htmlcode)
 
-    htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + number).text
-    html = etree.HTML(htmlcode)
-    # R18 countinue
-    htmlcode = session.get(html.xpath('//*[@id="main_content"]/table[1]/tbody/tr/td[2]/table/tbody/tr/td/h4/a[2]/@href')[0]).text
-    gcolle_crawler = Crawler(htmlcode)
+        number_html = gcolle_crawler.getString('//td[contains(text(),"商品番号")]/../td[2]/text()')
+        if number != number_html:
+            raise Exception('[-]gcolle.py: number not match')
 
-    number_html = gcolle_crawler.getString('//td[contains(text(),"商品番号")]/../td[2]/text()')
-    if number != number_html:
-        if config_file.debug():
-            print('[!]gcolle.py: number not match')
-        return {'title':''}
+        # get extrafanart url
+        if len(gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src')) == 0:
+            extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/a/img/@src')
+        else:
+            extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src')
+        # Add "https:" in each extrafanart url
+        for i in range(len(extrafanart)):
+            extrafanart[i] = 'https:' + extrafanart[i]
 
-    # get extrafanart url
-    if len(gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src')) == 0:
-        extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/a/img/@src')
-    else:
-        extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src')
-    # Add "https:" in each extrafanart url
-    for i in range(len(extrafanart)):
-        extrafanart[i] = 'https:' + extrafanart[i]
+        dic = {
+            "title": gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[1]/td/h1/text()'),
+            "studio": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
+            "year": re.findall('\d{4}',gcolle_crawler.getString('//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime'))[0],
+            "outline": gcolle_crawler.getOutline('//*[@id="cart_quantity"]/table/tr[3]/td/p/text()'),
+            "runtime": '',
+            "director": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
+            "actor": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
+            "release": re.findall('\d{4}-\d{2}-\d{2}',gcolle_crawler.getString('//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime'))[0],
+            "number": "GCOLLE-" + str(number_html),
+            "cover": "https:" + gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href'),
+            "thumb": "https:" + gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href'),
+            "trailer": '',
+            "actor_photo":'',
+            "imagecut": 4, # 该值为4时同时也是有码影片 也用人脸识别裁剪封面
+            "tag": gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[4]/td/a/text()'),
+            "extrafanart":extrafanart,
+            "label": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
+            "website": 'https://gcolle.net/product_info.php/products_id/' + number,
+            "source": 'gcolle.py',
+            "series": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
+            '无码': False,
+        }
+        # for k,v in dic.items():
+        #     if k == 'outline':
+        #         print(k,len(v))
+        #     else:
+        #         print(k,v)
+        # print('===============================================================')
+    except Exception as e:
+        dic = {'title':''}
+        if config.getInstance().debug():
+            print(e)
 
-    dic = {
-        "title": gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[1]/td/h1/text()'),
-        "studio": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
-        "year": re.findall('\d{4}',gcolle_crawler.getString('//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime'))[0],
-        "outline": gcolle_crawler.getOutline('//*[@id="cart_quantity"]/table/tr[3]/td/p/text()'),
-        "runtime": '',
-        "director": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
-        "actor": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
-        "release": re.findall('\d{4}-\d{2}-\d{2}',gcolle_crawler.getString('//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime'))[0],
-        "number": "GCOLLE-" + str(number_html),
-        "cover": "https:" + gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href'),
-        "thumb": "https:" + gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href'),
-        "trailer": '',
-        "actor_photo":'',
-        "imagecut": 4, # 该值为4时同时也是有码影片 也用人脸识别裁剪封面
-        "tag": gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[4]/td/a/text()'),
-        "extrafanart":extrafanart,
-        "label": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
-        "website": 'https://gcolle.net/product_info.php/products_id/' + number,
-        "source": 'gcolle.py',
-        "series": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
-        '无码': False,
-    }
-    # for k,v in dic.items():
-    #     if k == 'outline':
-    #         print(k,len(v))
-    #     else:
-    #         print(k,v)
-    # print('===============================================================')
     return dic
 
 if __name__ == '__main__':
     from pprint import pprint
+    config.getInstance().set_override("debug_mode:switch=1")
     pprint(main('840724'))
     pprint(main('840386'))
     pprint(main('838671'))