diff --git a/WebCrawler/gcolle.py b/WebCrawler/gcolle.py index 7330cdd..87d6bda 100644 --- a/WebCrawler/gcolle.py +++ b/WebCrawler/gcolle.py @@ -4,71 +4,85 @@ sys.path.append('../') from WebCrawler.crawler import * from ADC_function import * from lxml import etree -from requests_html import HTMLSession + def main(number): - config_file = config.getInstance() + save_cookies = False + cookie_filename = 'gcolle.json' + try: + gcolle_cooikes, cookies_filepath = load_cookies(cookie_filename) + session = get_html_session(cookies=gcolle_cooikes) + number = number.upper().replace('GCOLLE-','') - number = number.upper().replace('GCOLLE-','') - session = get_html_session() + htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + number).text + gcolle_crawler = Crawler(htmlcode) + r18_continue = gcolle_crawler.getString('//*[@id="main_content"]/table[1]/tbody/tr/td[2]/table/tbody/tr/td/h4/a[2]/@href') + if r18_continue and r18_continue.startswith('http'): + htmlcode = session.get(r18_continue).text + gcolle_crawler = Crawler(htmlcode) + save_cookies = True + cookies_filepath and len(cookies_filepath) and Path(cookies_filepath).is_file() and Path(cookies_filepath).unlink(missing_ok=True) - htmlcode = get_html_session('https://gcolle.net/product_info.php/products_id/' + number) - htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + number).text - html = etree.HTML(htmlcode) - # R18 countinue - htmlcode = session.get(html.xpath('//*[@id="main_content"]/table[1]/tbody/tr/td[2]/table/tbody/tr/td/h4/a[2]/@href')[0]).text - gcolle_crawler = Crawler(htmlcode) + number_html = gcolle_crawler.getString('//td[contains(text(),"商品番号")]/../td[2]/text()') + if number != number_html: + raise Exception('[-]gcolle.py: number not match') - number_html = gcolle_crawler.getString('//td[contains(text(),"商品番号")]/../td[2]/text()') - if number != number_html: - if config_file.debug(): - print('[!]gcolle.py: number not match') - return {'title':''} + if save_cookies: + cookies_save = Path.home() / f".local/share/mdc/{cookie_filename}" + cookies_save.parent.mkdir(parents=True, exist_ok=True) + cookies_save.write_text(json.dumps(session.cookies.get_dict(), sort_keys=True, indent=4), encoding='utf-8') - # get extrafanart url - if len(gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src')) == 0: - extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/a/img/@src') - else: - extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src') - # Add "https:" in each extrafanart url - for i in range(len(extrafanart)): - extrafanart[i] = 'https:' + extrafanart[i] + # get extrafanart url + if len(gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src')) == 0: + extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/a/img/@src') + else: + extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src') + # Add "https:" in each extrafanart url + for i in range(len(extrafanart)): + extrafanart[i] = 'https:' + extrafanart[i] + + dic = { + "title": gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[1]/td/h1/text()'), + "studio": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'), + "year": re.findall('\d{4}',gcolle_crawler.getString('//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime'))[0], + "outline": gcolle_crawler.getOutline('//*[@id="cart_quantity"]/table/tr[3]/td/p/text()'), + "runtime": '', + "director": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'), + "actor": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'), + "release": re.findall('\d{4}-\d{2}-\d{2}',gcolle_crawler.getString('//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime'))[0], + "number": "GCOLLE-" + str(number_html), + "cover": "https:" + gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href'), + "thumb": "https:" + gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href'), + "trailer": '', + "actor_photo":'', + "imagecut": 4, # 该值为4时同时也是有码影片 也用人脸识别裁剪封面 + "tag": gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[4]/td/a/text()'), + "extrafanart":extrafanart, + "label": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'), + "website": 'https://gcolle.net/product_info.php/products_id/' + number, + "source": 'gcolle.py', + "series": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'), + '无码': False, + } + # for k,v in dic.items(): + # if k == 'outline': + # print(k,len(v)) + # else: + # print(k,v) + # print('===============================================================') + except Exception as e: + dic = {'title':''} + if config.getInstance().debug(): + print(e) - dic = { - "title": gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[1]/td/h1/text()'), - "studio": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'), - "year": re.findall('\d{4}',gcolle_crawler.getString('//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime'))[0], - "outline": gcolle_crawler.getOutline('//*[@id="cart_quantity"]/table/tr[3]/td/p/text()'), - "runtime": '', - "director": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'), - "actor": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'), - "release": re.findall('\d{4}-\d{2}-\d{2}',gcolle_crawler.getString('//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime'))[0], - "number": "GCOLLE-" + str(number_html), - "cover": "https:" + gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href'), - "thumb": "https:" + gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href'), - "trailer": '', - "actor_photo":'', - "imagecut": 4, # 该值为4时同时也是有码影片 也用人脸识别裁剪封面 - "tag": gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[4]/td/a/text()'), - "extrafanart":extrafanart, - "label": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'), - "website": 'https://gcolle.net/product_info.php/products_id/' + number, - "source": 'gcolle.py', - "series": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'), - '无码': False, - } - # for k,v in dic.items(): - # if k == 'outline': - # print(k,len(v)) - # else: - # print(k,v) - # print('===============================================================') return dic if __name__ == '__main__': - main('840724') - main('840386') - main('838671') - main('814179') - main('834255') - main('814179') + from pprint import pprint + config.getInstance().set_override("debug_mode:switch=1") + pprint(main('840724')) + pprint(main('840386')) + pprint(main('838671')) + pprint(main('814179')) + pprint(main('834255')) + pprint(main('814179')) diff --git a/requirements.txt b/requirements.txt index 333a750..7b63d8e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,3 @@ certifi==2020.12.5 MechanicalSoup==1.1.0 opencc-python-reimplemented face_recognition -requests_html \ No newline at end of file