From 72a9790858faa67c0f8506cc9f0da244eef91cec Mon Sep 17 00:00:00 2001 From: root Date: Wed, 12 Aug 2020 18:24:46 +0800 Subject: [PATCH] Update Pre-release 3.7 --- AV_Data_Capture.py | 4 +- WebCrawler/avsox.py | 124 ++++++++++++++++ WebCrawler/dlsite.py | 131 ++++++++++++++++ WebCrawler/fanza.py | 297 +++++++++++++++++++++++++++++++++++++ WebCrawler/fc2fans_club.py | 165 +++++++++++++++++++++ WebCrawler/jav321.py | 156 +++++++++++++++++++ WebCrawler/javbus.py | 167 +++++++++++++++++++++ WebCrawler/javdb.py | 154 +++++++++++++++++++ WebCrawler/javlib.py | 110 ++++++++++++++ WebCrawler/mgstage.py | 120 +++++++++++++++ WebCrawler/xcity.py | 192 ++++++++++++++++++++++++ core.py | 5 + 12 files changed, 1623 insertions(+), 2 deletions(-) create mode 100644 WebCrawler/avsox.py create mode 100644 WebCrawler/dlsite.py create mode 100644 WebCrawler/fanza.py create mode 100644 WebCrawler/fc2fans_club.py create mode 100644 WebCrawler/jav321.py create mode 100644 WebCrawler/javbus.py create mode 100644 WebCrawler/javdb.py create mode 100644 WebCrawler/javlib.py create mode 100644 WebCrawler/mgstage.py create mode 100644 WebCrawler/xcity.py diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index 17e3b27..8d9d793 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -33,7 +33,7 @@ def movie_lists(root, escape_folder): if folder in root: return [] total = [] - file_type = ['.mp4', '.avi', '.rmvb', '.wmv', '.mov', '.mkv', '.flv', '.ts', '.webm', '.MP4', '.AVI', '.RMVB', '.WMV','.MOV', '.MKV', '.FLV', '.TS', '.WEBM', ] + file_type = ['.mp4', '.avi', '.rmvb', '.wmv', '.mov', '.mkv', '.flv', '.ts', '.webm', '.MP4', '.AVI', '.RMVB', '.WMV','.MOV', '.MKV', '.FLV', '.TS', '.WEBM', '.iso','.ISO'] dirs = os.listdir(root) for entry in dirs: f = os.path.join(root, entry) @@ -110,7 +110,7 @@ def create_data_and_move_with_custom_number(file_path: str, c: config.Config, cu if __name__ == '__main__': - version = '3.6' + version = '3.7' # Parse command line args single_file_path, config_file, auto_exit, custom_number = argparse_function() diff --git a/WebCrawler/avsox.py b/WebCrawler/avsox.py new file mode 100644 index 0000000..c3d0b6a --- /dev/null +++ b/WebCrawler/avsox.py @@ -0,0 +1,124 @@ +import re +from lxml import etree +import json +from bs4 import BeautifulSoup +from ADC_function import * +# import sys +# import io +# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) + +def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img + soup = BeautifulSoup(htmlcode, 'lxml') + a = soup.find_all(attrs={'class': 'avatar-box'}) + d = {} + for i in a: + l = i.img['src'] + t = i.span.get_text() + p2 = {t: l} + d.update(p2) + return d +def getTitle(a): + try: + html = etree.fromstring(a, etree.HTMLParser()) + result = str(html.xpath('/html/body/div[2]/h3/text()')).strip(" ['']") #[0] + return result.replace('/', '') + except: + return '' +def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text() + soup = BeautifulSoup(a, 'lxml') + a = soup.find_all(attrs={'class': 'avatar-box'}) + d = [] + for i in a: + d.append(i.span.get_text()) + return d +def getStudio(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()')).strip(" ['']").replace("', '",' ') + return result1 +def getRuntime(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = 
str(html.xpath('//span[contains(text(),"长度:")]/../text()')).strip(" ['分钟']") + return result1 +def getLabel(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()')).strip(" ['']") + return result1 +def getNum(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//span[contains(text(),"识别码:")]/../span[2]/text()')).strip(" ['']") + return result1 +def getYear(release): + try: + result = str(re.search('\d{4}',release).group()) + return result + except: + return release +def getRelease(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//span[contains(text(),"发行时间:")]/../text()')).strip(" ['']") + return result1 +def getCover(htmlcode): + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = str(html.xpath('/html/body/div[2]/div[1]/div[1]/a/img/@src')).strip(" ['']") + return result +def getCover_small(htmlcode): + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = str(html.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')).strip(" ['']") + return result +def getTag(a): # 获取演员 + soup = BeautifulSoup(a, 'lxml') + a = soup.find_all(attrs={'class': 'genre'}) + d = [] + for i in a: + d.append(i.get_text()) + return d +def getSeries(htmlcode): + try: + html = etree.fromstring(htmlcode, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//span[contains(text(),"系列:")]/../span[2]/text()')).strip(" ['']") + return result1 + except: + return '' + +def main(number): + a = get_html('https://avsox.host/cn/search/' + number) + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']") + if result1 == '' or result1 == 'null' or result1 == 'None': + a = get_html('https://avsox.host/cn/search/' + number.replace('-', '_')) + print(a) + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']") + if result1 == '' or result1 == 'null' or result1 == 'None': + a = get_html('https://avsox.host/cn/search/' + number.replace('_', '')) + print(a) + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']") + web = get_html(result1) + soup = BeautifulSoup(web, 'lxml') + info = str(soup.find(attrs={'class': 'row movie'})) + dic = { + 'actor': getActor(web), + 'title': getTitle(web).strip(getNum(web)), + 'studio': getStudio(info), + 'outline': '',# + 'runtime': getRuntime(info), + 'director': '', # + 'release': getRelease(info), + 'number': getNum(info), + 'cover': getCover(web), + 'cover_small': getCover_small(a), + 'imagecut': 3, + 'tag': getTag(web), + 'label': getLabel(info), + 'year': getYear(getRelease(info)), # str(re.search('\d{4}',getRelease(a)).group()), + 'actor_photo': getActorPhoto(web), + 'website': result1, + 'source': 'avsox.py', + 'series': getSeries(info), + } + js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') + return js + +if __name__ == "__main__": + print(main('012717_472')) \ No newline at end of file diff --git a/WebCrawler/dlsite.py b/WebCrawler/dlsite.py new file mode 100644 index 0000000..f4d1501 --- /dev/null +++ b/WebCrawler/dlsite.py @@ -0,0 +1,131 @@ +import re +from 
lxml import etree +import json +from bs4 import BeautifulSoup +from ADC_function import * +# import sys +# import io +# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) +#print(get_html('https://www.dlsite.com/pro/work/=/product_id/VJ013152.html')) +#title //*[@id="work_name"]/a/text() +#studio //th[contains(text(),"ブランド名")]/../td/span[1]/a/text() +#release //th[contains(text(),"販売日")]/../td/a/text() +#story //th[contains(text(),"シナリオ")]/../td/a/text() +#senyo //th[contains(text(),"声優")]/../td/a/text() +#tag //th[contains(text(),"ジャンル")]/../td/div/a/text() +#jianjie //*[@id="main_inner"]/div[3]/text() +#photo //*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li/img/@src + +#https://www.dlsite.com/pro/work/=/product_id/VJ013152.html + +def getTitle(a): + html = etree.fromstring(a, etree.HTMLParser()) + result = html.xpath('//*[@id="work_name"]/a/text()')[0] + return result +def getActor(a): # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text() + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = html.xpath('//th[contains(text(),"声優")]/../td/a/text()') + return result1 +def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img + a = actor.split(',') + d={} + for i in a: + p={i:''} + d.update(p) + return d +def getStudio(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result = html.xpath('//th[contains(text(),"ブランド名")]/../td/span[1]/a/text()')[0] + return result +def getRuntime(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']") + result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']") + return str(result1 + result2).strip('+').rstrip('mi') +def getLabel(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result = html.xpath('//th[contains(text(),"ブランド名")]/../td/span[1]/a/text()')[0] + return result +def getYear(getRelease): + try: + result = str(re.search('\d{4}', getRelease).group()) + return result + except: + return getRelease +def getRelease(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = html.xpath('//th[contains(text(),"販売日")]/../td/a/text()')[0] + return result1.replace('年','-').replace('月','-').replace('日','') +def getTag(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + try: + result = html.xpath('//th[contains(text(),"ジャンル")]/../td/div/a/text()') + return result + except: + return '' + +def getCover_small(a, index=0): + # same issue mentioned below, + # javdb sometime returns multiple results + # DO NOT just get the firt one, get the one with correct index number + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + try: + result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index] + if not 'https' in result: + result = 'https:' + result + return result + except: # 2020.7.17 Repair Cover Url crawl + result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@data-src")[index] + if not 'https' in result: + result = 'https:' + result + return result +def getCover(htmlcode): + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = html.xpath('//*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li/img/@src')[0] + return result +def getDirector(a): + html = etree.fromstring(a, etree.HTMLParser()) # 
//table/tr[1]/td[1]/text() + result = html.xpath('//th[contains(text(),"シナリオ")]/../td/a/text()')[0] + return result +def getOutline(htmlcode): + html = etree.fromstring(htmlcode, etree.HTMLParser()) + total = [] + result = html.xpath('//*[@id="main_inner"]/div[3]/text()') + for i in result: + total.append(i.strip('\r\n')) + return str(total).strip(" ['']").replace("', '', '",r'\n').replace("', '",r'\n').strip(", '', '") +def getSeries(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = html.xpath('//th[contains(text(),"声優")]/../td/a/text()') + return result1 +def main(number): + number = number.upper() + htmlcode = get_html('https://www.dlsite.com/pro/work/=/product_id/' + number + '.html') + + dic = { + 'actor': getActor(htmlcode), + 'title': getTitle(htmlcode), + 'studio': getStudio(htmlcode), + 'outline': getOutline(htmlcode), + 'runtime': getRuntime(htmlcode), + 'director': getDirector(htmlcode), + 'release': getRelease(htmlcode), + 'number': number, + 'cover': 'https:' + getCover(htmlcode), + 'cover_small': '', + 'imagecut': 0, + 'tag': getTag(htmlcode), + 'label': getLabel(htmlcode), + 'year': getYear(getRelease(htmlcode)), # str(re.search('\d{4}',getRelease(a)).group()), + 'actor_photo': '', + 'website': 'https://www.dlsite.com/pro/work/=/product_id/' + number + '.html', + 'source': 'dlsite.py', + 'series': getSeries(htmlcode), + } + js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') + return js + +# main('DV-1562') +# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。") +if __name__ == "__main__": + print(main('VJ013479')) diff --git a/WebCrawler/fanza.py b/WebCrawler/fanza.py new file mode 100644 index 0000000..71aab6a --- /dev/null +++ b/WebCrawler/fanza.py @@ -0,0 +1,297 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +import json +import re +from urllib.parse import urlencode + +from lxml import etree + +from ADC_function import * + +# import sys +# import io +# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) + + +def getTitle(text): + html = etree.fromstring(text, etree.HTMLParser()) + result = html.xpath('//*[starts-with(@id, "title")]/text()')[0] + return result + + +def getActor(text): + # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text() + html = etree.fromstring(text, etree.HTMLParser()) + result = ( + str( + html.xpath( + "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()" + ) + ) + .strip(" ['']") + .replace("', '", ",") + ) + return result + + +def getStudio(text): + html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + try: + result = html.xpath( + "//td[contains(text(),'メーカー')]/following-sibling::td/a/text()" + )[0] + except: + result = html.xpath( + "//td[contains(text(),'メーカー')]/following-sibling::td/text()" + )[0] + return result + + +def getRuntime(text): + html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result = html.xpath("//td[contains(text(),'収録時間')]/following-sibling::td/text()")[0] + return re.search(r"\d+", str(result)).group() + + +def getLabel(text): + html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + try: + result = html.xpath( + "//td[contains(text(),'レーベル:')]/following-sibling::td/a/text()" + )[0] + except: + result = html.xpath( + "//td[contains(text(),'レーベル:')]/following-sibling::td/text()" + )[0] + return result + + 
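+# The metadata helpers in this file share one fallback pattern: read the
+# anchor-wrapped value (following-sibling::td/a/text()) first, then fall back
+# to the bare td text, because DMM renders linked and unlinked fields
+# differently from page to page.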
+def getNum(text): + html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + try: + result = html.xpath( + "//td[contains(text(),'品番:')]/following-sibling::td/a/text()" + )[0] + except: + result = html.xpath( + "//td[contains(text(),'品番:')]/following-sibling::td/text()" + )[0] + return result + + +def getYear(getRelease): + try: + result = str(re.search(r"\d{4}", getRelease).group()) + return result + except: + return getRelease + + +def getRelease(text): + html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + try: + result = html.xpath( + "//td[contains(text(),'発売日:')]/following-sibling::td/a/text()" + )[0].lstrip("\n") + except: + try: + result = html.xpath( + "//td[contains(text(),'発売日:')]/following-sibling::td/text()" + )[0].lstrip("\n") + except: + result = "----" + if result == "----": + try: + result = html.xpath( + "//td[contains(text(),'配信開始日:')]/following-sibling::td/a/text()" + )[0].lstrip("\n") + except: + try: + result = html.xpath( + "//td[contains(text(),'配信開始日:')]/following-sibling::td/text()" + )[0].lstrip("\n") + except: + pass + return result.replace("/", "-") + + +def getTag(text): + html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + try: + result = html.xpath( + "//td[contains(text(),'ジャンル:')]/following-sibling::td/a/text()" + ) + except: + result = html.xpath( + "//td[contains(text(),'ジャンル:')]/following-sibling::td/text()" + ) + return result + + +def getCover(text, number): + html = etree.fromstring(text, etree.HTMLParser()) + cover_number = number + try: + result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0] + except: + # sometimes fanza modify _ to \u0005f for image id + if "_" in cover_number: + cover_number = cover_number.replace("_", r"\u005f") + try: + result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0] + except: + # (TODO) handle more edge case + # print(html) + # raise exception here, same behavior as before + # people's major requirement is fetching the picture + raise ValueError("can not find image") + return result + + +def getDirector(text): + html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + try: + result = html.xpath( + "//td[contains(text(),'監督:')]/following-sibling::td/a/text()" + )[0] + except: + result = html.xpath( + "//td[contains(text(),'監督:')]/following-sibling::td/text()" + )[0] + return result + + +def getOutline(text): + html = etree.fromstring(text, etree.HTMLParser()) + try: + result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")[0]).replace( + "\n", "" + ) + if result == "": + result = str(html.xpath("//div[@class='mg-b20 lh4']//p/text()")[0]).replace( + "\n", "" + ) + except: + # (TODO) handle more edge case + # print(html) + return "" + return result + + +def getSeries(text): + try: + html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + try: + result = html.xpath( + "//td[contains(text(),'シリーズ:')]/following-sibling::td/a/text()" + )[0] + except: + result = html.xpath( + "//td[contains(text(),'シリーズ:')]/following-sibling::td/text()" + )[0] + return result + except: + return "" + + +def main(number): + # fanza allow letter + number + underscore, normalize the input here + # @note: I only find the usage of underscore as h_test123456789 + fanza_search_number = number + # AV_Data_Capture.py.getNumber() over format the input, restore the h_ prefix + if fanza_search_number.startswith("h-"): + fanza_search_number = fanza_search_number.replace("h-", "h_") + + fanza_search_number 
= re.sub(r"[^0-9a-zA-Z_]", "", fanza_search_number).lower() + + fanza_urls = [ + "https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=", + "https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=", + "https://www.dmm.co.jp/digital/anime/-/detail/=/cid=", + "https://www.dmm.co.jp/mono/anime/-/detail/=/cid=", + "https://www.dmm.co.jp/digital/videoc/-/detail/=/cid=", + "https://www.dmm.co.jp/digital/nikkatsu/-/detail/=/cid=", + "https://www.dmm.co.jp/rental/-/detail/=/cid=", + ] + chosen_url = "" + + for url in fanza_urls: + chosen_url = url + fanza_search_number + htmlcode = get_html( + "https://www.dmm.co.jp/age_check/=/declared=yes/?{}".format( + urlencode({"rurl": chosen_url}) + ) + ) + if "404 Not Found" not in htmlcode: + break + if "404 Not Found" in htmlcode: + return json.dumps({"title": "",}) + try: + # for some old page, the input number does not match the page + # for example, the url will be cid=test012 + # but the hinban on the page is test00012 + # so get the hinban first, and then pass it to following functions + fanza_hinban = getNum(htmlcode) + data = { + "title": getTitle(htmlcode).strip(), + "studio": getStudio(htmlcode), + "outline": getOutline(htmlcode), + "runtime": getRuntime(htmlcode), + "director": getDirector(htmlcode) if "anime" not in chosen_url else "", + "actor": getActor(htmlcode) if "anime" not in chosen_url else "", + "release": getRelease(htmlcode), + "number": fanza_hinban, + "cover": getCover(htmlcode, fanza_hinban), + "imagecut": 1, + "tag": getTag(htmlcode), + "label": getLabel(htmlcode), + "year": getYear( + getRelease(htmlcode) + ), # str(re.search('\d{4}',getRelease(a)).group()), + "actor_photo": "", + "website": chosen_url, + "source": "fanza.py", + "series": getSeries(htmlcode), + } + except: + data = { + "title": "", + } + js = json.dumps( + data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":") + ) # .encode('UTF-8') + return js + + +def main_htmlcode(number): + # fanza allow letter + number + underscore, normalize the input here + # @note: I only find the usage of underscore as h_test123456789 + fanza_search_number = number + # AV_Data_Capture.py.getNumber() over format the input, restore the h_ prefix + if fanza_search_number.startswith("h-"): + fanza_search_number = fanza_search_number.replace("h-", "h_") + + fanza_search_number = re.sub(r"[^0-9a-zA-Z_]", "", fanza_search_number).lower() + + fanza_urls = [ + "https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=", + "https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=", + "https://www.dmm.co.jp/digital/anime/-/detail/=/cid=", + "https://www.dmm.co.jp/mono/anime/-/detail/=/cid=", + "https://www.dmm.co.jp/digital/videoc/-/detail/=/cid=", + "https://www.dmm.co.jp/digital/nikkatsu/-/detail/=/cid=", + ] + chosen_url = "" + for url in fanza_urls: + chosen_url = url + fanza_search_number + htmlcode = get_html(chosen_url) + if "404 Not Found" not in htmlcode: + break + if "404 Not Found" in htmlcode: + return json.dumps({"title": "",}) + return htmlcode + + +if __name__ == "__main__": + print(main("DV-1562")) + print(main("96fad1217")) diff --git a/WebCrawler/fc2fans_club.py b/WebCrawler/fc2fans_club.py new file mode 100644 index 0000000..2c31a51 --- /dev/null +++ b/WebCrawler/fc2fans_club.py @@ -0,0 +1,165 @@ +import re +from lxml import etree#need install +import json +import ADC_function +# import sys +# import io +# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) + +def getTitle(htmlcode): #获取厂商 + #print(htmlcode) + html = 
etree.fromstring(htmlcode,etree.HTMLParser()) + result = str(html.xpath('/html/body/div[2]/div/div[1]/h3/text()')).strip(" ['']") + result2 = str(re.sub('\D{2}2-\d+','',result)).replace(' ','',1) + #print(result2) + return result2 +def getActor(htmlcode): + try: + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = str(html.xpath('/html/body/div[2]/div/div[1]/h5[5]/a/text()')).strip(" ['']") + return result + except: + return '' +def getStudio(htmlcode): #获取厂商 + html = etree.fromstring(htmlcode,etree.HTMLParser()) + result = str(html.xpath('/html/body/div[2]/div/div[1]/h5[3]/a[1]/text()')).strip(" ['']") + return result +def getNum(htmlcode): #获取番号 + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']") + #print(result) + return result +def getRelease(htmlcode2): # + #a=ADC_function.get_html('http://adult.contents.fc2.com/article_search.php?id='+str(number).lstrip("FC2-").lstrip("fc2-").lstrip("fc2_").lstrip("fc2-")+'&utm_source=aff_php&utm_medium=source_code&utm_campaign=from_aff_php') + html=etree.fromstring(htmlcode2,etree.HTMLParser()) + result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[2]/dl/dd[4]/text()')).strip(" ['']") + return result +def getCover(htmlcode,number,htmlcode2): #获取厂商 # + #a = ADC_function.get_html('http://adult.contents.fc2.com/article_search.php?id=' + str(number).lstrip("FC2-").lstrip("fc2-").lstrip("fc2_").lstrip("fc2-") + '&utm_source=aff_php&utm_medium=source_code&utm_campaign=from_aff_php') + html = etree.fromstring(htmlcode2, etree.HTMLParser()) + result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[1]/a/img/@src')).strip(" ['']") + if result == '': + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result2 = str(html.xpath('//*[@id="slider"]/ul[1]/li[1]/img/@src')).strip(" ['']") + return 'https://fc2club.com' + result2 + return 'http:' + result +def getOutline(htmlcode2): #获取番号 # + html = etree.fromstring(htmlcode2, etree.HTMLParser()) + result = str(html.xpath('/html/body/div[1]/div[2]/div[2]/div[1]/div/article/section[4]/p/text()')).strip(" ['']").replace("\\n",'',10000).replace("'",'',10000).replace(', ,','').strip(' ').replace('。,',',') + return result +def getTag(htmlcode): #获取番号 + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = str(html.xpath('/html/body/div[2]/div/div[1]/h5[4]/a/text()')) + return result.strip(" ['']").replace("'",'').replace(' ','') +def getYear(release): + try: + result = re.search('\d{4}',release).group() + return result + except: + return '' + +def getTitle_fc2com(htmlcode): #获取厂商 + html = etree.fromstring(htmlcode,etree.HTMLParser()) + result = html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/h3/text()')[0] + return result +def getActor_fc2com(htmlcode): + try: + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')[0] + return result + except: + return '' +def getStudio_fc2com(htmlcode): #获取厂商 + try: + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')).strip(" ['']") + return result + except: + return '' +def getNum_fc2com(htmlcode): #获取番号 + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']") + return result +def 
getRelease_fc2com(htmlcode2):  # get release date
+    html=etree.fromstring(htmlcode2,etree.HTMLParser())
+    result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[2]/dl/dd[4]/text()')).strip(" ['']")
+    return result
+def getCover_fc2com(htmlcode2):  # get cover
+    html = etree.fromstring(htmlcode2, etree.HTMLParser())
+    result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[1]/span/img/@src')).strip(" ['']")
+    return 'http:' + result
+def getOutline_fc2com(htmlcode2):  # get outline
+    html = etree.fromstring(htmlcode2, etree.HTMLParser())
+    result = str(html.xpath('/html/body/div/text()')).strip(" ['']").replace("\\n",'',10000).replace("'",'',10000).replace(', ,','').strip(' ').replace('。,',',')
+    return result
+def getTag_fc2com(number):  # get tags from the fc2 API
+    htmlcode = str(bytes(ADC_function.get_html('http://adult.contents.fc2.com/api/v4/article/'+number+'/tag?'),'utf-8').decode('unicode-escape'))
+    result = re.findall('"tag":"(.*?)"', htmlcode)
+    return result
+def getYear_fc2com(release):
+    try:
+        result = re.search('\d{4}',release).group()
+        return result
+    except:
+        return ''
+
+def main(number):
+    try:
+        number = number.replace('FC2-', '').replace('fc2-', '')
+        htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/'+number+'/')
+        htmlcode = ADC_function.get_html('https://fc2club.com//html/FC2-' + number + '.html')
+        actor = getActor(htmlcode)
+        if getActor(htmlcode) == '':
+            actor = 'FC2系列'
+        dic = {
+            'title': getTitle(htmlcode),
+            'studio': getStudio(htmlcode),
+            'year': '',  # str(re.search('\d{4}',getRelease(htmlcode2)).group()),
+            'outline': '',  # getOutline(htmlcode2),
+            'runtime': getYear(getRelease(htmlcode2)),
+            'director': getStudio(htmlcode),
+            'actor': actor,
+            'release': getRelease(htmlcode2),
+            'number': 'FC2-'+number,
+            'label': '',
+            'cover': getCover(htmlcode,number,htmlcode2),
+            'imagecut': 0,
+            'tag': getTag(htmlcode),
+            'actor_photo':'',
+            'website': 'https://fc2club.com//html/FC2-' + number + '.html',
+            'source':'https://fc2club.com//html/FC2-' + number + '.html',
+            'series': '',
+        }
+        if dic['title'] == '':
+            htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/',cookies={'wei6H':'1'})
+            actor = getActor(htmlcode)
+            if getActor(htmlcode) == '':
+                actor = 'FC2系列'
+            dic = {
+                'title': getTitle_fc2com(htmlcode2),
+                'studio': getStudio_fc2com(htmlcode2),
+                'year': '',  # str(re.search('\d{4}',getRelease_fc2com(htmlcode2)).group()),
+                'outline': getOutline_fc2com(htmlcode2),
+                'runtime': getYear_fc2com(getRelease_fc2com(htmlcode2)),
+                'director': getStudio_fc2com(htmlcode2),
+                'actor': actor,
+                'release': getRelease_fc2com(htmlcode2),
+                'number': 'FC2-' + number,
+                'cover': getCover_fc2com(htmlcode2),
+                'imagecut': 0,
+                'tag': getTag_fc2com(number),
+                'label': '',
+                'actor_photo': '',
+                'website': 'http://adult.contents.fc2.com/article/' + number + '/',
+                'source': 'http://adult.contents.fc2.com/article/' + number + '/',
+                'series': '',
+            }
+    except Exception as e:
+        # (TODO) better handle this
+        # print(e)
+        dic = {"title": ""}
+    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'),)  # .encode('UTF-8')
+    return js
+
+if __name__ == '__main__':
+    print(main('1252953'))
\ No newline at end of file
diff --git a/WebCrawler/jav321.py b/WebCrawler/jav321.py
new file mode 100644
index 0000000..7b0baae
--- /dev/null
+++ b/WebCrawler/jav321.py
@@ -0,0 +1,156 @@
+import json
+from bs4 import BeautifulSoup
+from lxml import html
+from ADC_function import post_html
+
+
+def main(number: str) -> str:
+    result = post_html(url="https://www.jav321.com/search", query={"sn": number})
+    soup = BeautifulSoup(result.text, "html.parser")
+    lx = html.fromstring(str(soup))
+
+    if "/video/" in result.url:
+        data = parse_info(soup)
+        dic = {
+            "title": get_title(lx),
+            "year": get_year(data),
+            "outline": get_outline(lx),
+            "director": "",
+            "cover": get_cover(lx),
+            "imagecut": 1,
+            "actor_photo": "",
+            "website": result.url,
+            "source": "jav321.py",
+            **data,
+        }
+    else:
+        dic = {}
+
+    return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
+
+
+def get_title(lx: html.HtmlElement) -> str:
+    return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[1]/h3/text()")[0].strip()
+
+
+def parse_info(soup: BeautifulSoup) -> dict:
+    data = soup.select_one("div.row > div.col-md-9")
+
+    if data:
+        dd = str(data).split("<br/>")
+        data_dic = {}
+        for d in dd:
+            data_dic[get_bold_text(h=d)] = d
+
+        return {
+            "actor": get_actor(data_dic),
+            "label": get_label(data_dic),
+            "studio": get_studio(data_dic),
+            "tag": get_tag(data_dic),
+            "number": get_number(data_dic),
+            "release": get_release(data_dic),
+            "runtime": get_runtime(data_dic),
+            "series": get_series(data_dic),
+        }
+    else:
+        return {}
+
+
+def get_bold_text(h: str) -> str:
+    soup = BeautifulSoup(h, "html.parser")
+    if soup.b:
+        return soup.b.text
+    else:
+        return "UNKNOWN_TAG"
+
+
+def get_anchor_info(h: str) -> str:
+    result = []
+
+    data = BeautifulSoup(h, "html.parser").find_all("a", href=True)
+    for d in data:
+        result.append(d.text)
+
+    return ",".join(result)
+
+
+def get_text_info(h: str) -> str:
+    return h.split(": ")[1]
+
+
+def get_cover(lx: html.HtmlElement) -> str:
+    return lx.xpath("/html/body/div[2]/div[2]/div[1]/p/a/img/@src")[0]
+
+
+def get_outline(lx: html.HtmlElement) -> str:
+    return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[3]/div/text()")[0]
+
+def get_series2(lx: html.HtmlElement) -> str:
+    return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[1]/div[2]/a[11]/text()")[0]
+
+
+def get_actor(data: dict) -> str:
+    if "女优" in data:
+        return get_anchor_info(data["女优"])
+    else:
+        return ""
+
+
+def get_label(data: dict) -> str:
+    if "片商" in data:
+        return get_anchor_info(data["片商"])
+    else:
+        return ""
+
+
+def get_tag(data: dict) -> str:
+    if "标签" in data:
+        return get_anchor_info(data["标签"])
+    else:
+        return ""
+
+
+def get_studio(data: dict) -> str:
+    if "片商" in data:
+        return get_anchor_info(data["片商"])
+    else:
+        return ""
+
+
+def get_number(data: dict) -> str:
+    if "番号" in data:
+        return get_text_info(data["番号"])
+    else:
+        return ""
+
+
+def get_release(data: dict) -> str:
+    if "发行日期" in data:
+        return get_text_info(data["发行日期"])
+    else:
+        return ""
+
+
+def get_runtime(data: dict) -> str:
+    if "播放时长" in data:
+        return get_text_info(data["播放时长"])
+    else:
+        return ""
+
+
+def get_year(data: dict) -> str:
+    if "release" in data:
+        return data["release"][:4]
+    else:
+        return ""
+
+
+def get_series(data: dict) -> str:
+    if "系列" in data:
+        return get_anchor_info(data["系列"])
+    else:
+        return ""
+
+
+if __name__ == "__main__":
+    print(main("soe-259"))
diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py
new file mode 100644
index 0000000..7d51a4d
--- /dev/null
+++ b/WebCrawler/javbus.py
@@ -0,0 +1,167 @@
+import re
+from pyquery import PyQuery as pq  # need install
+from lxml import etree  # need install
+from bs4 import BeautifulSoup  # need install
+import json
+from ADC_function import *
+import fanza
+
+def getActorPhoto(htmlcode):  # //*[@id="star_qdt"]/li/a/img
+    soup = BeautifulSoup(htmlcode, 'lxml')
+    a = soup.find_all(attrs={'class': 'star-name'})
+    d={}
+    for i in a:
+        l=i.a['href']
+        t=i.get_text()
+        html = etree.fromstring(get_html(l), etree.HTMLParser())
+        p=str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']")
+        p2={t:p}
+        d.update(p2)
+    return d
+def getTitle(htmlcode):  # get title
+    doc = pq(htmlcode)
+    title=str(doc('div.container h3').text()).replace(' ','-')
+    try:
+        title2 = re.sub('n\d+-','',title)
+        return title2
+    except:
+        return title
+def getStudio(htmlcode):  # get studio
+    html = etree.fromstring(htmlcode,etree.HTMLParser())
+    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/a/text()')).strip(" ['']")
+    return result
+def getYear(htmlcode):  # get year
+    html = etree.fromstring(htmlcode,etree.HTMLParser())
+    result = 
str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']") + return result +def getCover(htmlcode): #获取封面链接 + doc = pq(htmlcode) + image = doc('a.bigImage') + return image.attr('href') +def getRelease(htmlcode): #获取出版日期 + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']") + return result +def getRuntime(htmlcode): #获取分钟 + soup = BeautifulSoup(htmlcode, 'lxml') + a = soup.find(text=re.compile('分鐘')) + return a +def getActor(htmlcode): #获取女优 + b=[] + soup=BeautifulSoup(htmlcode,'lxml') + a=soup.find_all(attrs={'class':'star-name'}) + for i in a: + b.append(i.get_text()) + return b +def getNum(htmlcode): #获取番号 + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']") + return result +def getDirector(htmlcode): #获取导演 + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']") + return result +def getCID(htmlcode): + html = etree.fromstring(htmlcode, etree.HTMLParser()) + #print(htmlcode) + string = html.xpath("//a[contains(@class,'sample-box')][1]/@href")[0].replace('https://pics.dmm.co.jp/digital/video/','') + result = re.sub('/.*?.jpg','',string) + return result +def getOutline(htmlcode): #获取演员 + html = etree.fromstring(htmlcode, etree.HTMLParser()) + try: + result = html.xpath("string(//div[contains(@class,'mg-b20 lh4')])").replace('\n','') + return result + except: + return '' +def getSerise(htmlcode): + try: + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']") + return result + except: + return '' +def getTag(htmlcode): # 获取演员 + tag = [] + soup = BeautifulSoup(htmlcode, 'lxml') + a = soup.find_all(attrs={'class': 'genre'}) + for i in a: + if 'onmouseout' in str(i): + continue + tag.append(i.get_text()) + return tag + +def main_uncensored(number): + htmlcode = get_html('https://www.javbus.com/' + number) + if getTitle(htmlcode) == '': + htmlcode = get_html('https://www.javbus.com/' + number.replace('-','_')) + try: + dww_htmlcode = fanza.main_htmlcode(getCID(htmlcode)) + except: + dww_htmlcode = '' + dic = { + 'title': str(re.sub('\w+-\d+-','',getTitle(htmlcode))).replace(getNum(htmlcode)+'-',''), + 'studio': getStudio(htmlcode), + 'year': getYear(htmlcode), + 'outline': getOutline(dww_htmlcode), + 'runtime': getRuntime(htmlcode), + 'director': getDirector(htmlcode), + 'actor': getActor(htmlcode), + 'release': getRelease(htmlcode), + 'number': getNum(htmlcode), + 'cover': getCover(htmlcode), + 'tag': getTag(htmlcode), + 'label': getSerise(htmlcode), + 'imagecut': 0, + 'actor_photo': '', + 'website': 'https://www.javbus.com/' + number, + 'source': 'javbus.py', + 'series': getSerise(htmlcode), + } + js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') + return js + + +def main(number): + try: + try: + htmlcode = get_html('https://www.javbus.com/' + number) + try: + dww_htmlcode = fanza.main_htmlcode(getCID(htmlcode)) + except: + dww_htmlcode = '' + dic = { + 'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))), + 'studio': getStudio(htmlcode), + 'year': str(re.search('\d{4}', getYear(htmlcode)).group()), + 'outline': getOutline(dww_htmlcode), + 'runtime': getRuntime(htmlcode), + 'director': getDirector(htmlcode), + 'actor': getActor(htmlcode), + 
'release': getRelease(htmlcode), + 'number': getNum(htmlcode), + 'cover': getCover(htmlcode), + 'imagecut': 1, + 'tag': getTag(htmlcode), + 'label': getSerise(htmlcode), + 'actor_photo': getActorPhoto(htmlcode), + 'website': 'https://www.javbus.com/' + number, + 'source': 'javbus.py', + 'series': getSerise(htmlcode), + } + js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, + separators=(',', ':'), ) # .encode('UTF-8') + return js + except: + return main_uncensored(number) + except: + data = { + "title": "", + } + js = json.dumps( + data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":") + ) + return js + +if __name__ == "__main__" : + print(main('ipx-292')) diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py new file mode 100644 index 0000000..b1656d0 --- /dev/null +++ b/WebCrawler/javdb.py @@ -0,0 +1,154 @@ +import re +from lxml import etree +import json +from bs4 import BeautifulSoup +from ADC_function import * +# import sys +# import io +# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) + +def getTitle(a): + html = etree.fromstring(a, etree.HTMLParser()) + result = html.xpath("/html/body/section/div/h2/strong/text()")[0] + return result +def getActor(a): # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text() + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//strong[contains(text(),"演員")]/../span/text()')).strip(" ['']") + result2 = str(html.xpath('//strong[contains(text(),"演員")]/../span/a/text()')).strip(" ['']") + return str(result1 + result2).strip('+').replace(",\\xa0", "").replace("'", "").replace(' ', '').replace(',,', '').lstrip(',').replace(',', ', ') +def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img + a = actor.split(',') + d={} + for i in a: + p={i:''} + d.update(p) + return d +def getStudio(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//strong[contains(text(),"片商")]/../span/text()')).strip(" ['']") + result2 = str(html.xpath('//strong[contains(text(),"片商")]/../span/a/text()')).strip(" ['']") + return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') +def getRuntime(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']") + result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']") + return str(result1 + result2).strip('+').rstrip('mi') +def getLabel(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']") + result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']") + return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') +def getNum(a): + html = etree.fromstring(a, etree.HTMLParser()) + result1 = str(html.xpath('//strong[contains(text(),"番號")]/../span/text()')).strip(" ['']") + result2 = str(html.xpath('//strong[contains(text(),"番號")]/../span/a/text()')).strip(" ['']") + return str(result2 + result1).strip('+') +def getYear(getRelease): + try: + result = str(re.search('\d{4}', getRelease).group()) + return result + except: + return getRelease +def getRelease(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = 
str(html.xpath('//strong[contains(text(),"時間")]/../span/text()')).strip(" ['']") + result2 = str(html.xpath('//strong[contains(text(),"時間")]/../span/a/text()')).strip(" ['']") + return str(result1 + result2).strip('+') +def getTag(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + try: + result = html.xpath('//strong[contains(text(),"類別")]/../span/a/text()') + return result + except: + result = html.xpath('//strong[contains(text(),"類別")]/../span/text()') + return result + +def getCover_small(a, index=0): + # same issue mentioned below, + # javdb sometime returns multiple results + # DO NOT just get the firt one, get the one with correct index number + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + try: + result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index] + if not 'https' in result: + result = 'https:' + result + return result + except: # 2020.7.17 Repair Cover Url crawl + result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@data-src")[index] + if not 'https' in result: + result = 'https:' + result + return result +def getCover(htmlcode): + html = etree.fromstring(htmlcode, etree.HTMLParser()) + try: + result = html.xpath("//div[contains(@class, 'column-video-cover')]/a/img/@src")[0] + except: # 2020.7.17 Repair Cover Url crawl + result = html.xpath("//div[contains(@class, 'column-video-cover')]/img/@src")[0] + return result +def getDirector(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']") + result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']") + return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') +def getOutline(htmlcode): + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = str(html.xpath('//*[@id="introduction"]/dd/p[1]/text()')).strip(" ['']") + return result +def getSeries(a): + #/html/body/section/div/div[3]/div[2]/nav/div[7]/span/a + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']") + result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']") + return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') +def main(number): + try: + number = number.upper() + try: + query_result = get_html('https://javdb.com/search?q=' + number + '&f=all') + except: + query_result = get_html('https://javdb4.com/search?q=' + number + '&f=all') + html = etree.fromstring(query_result, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + # javdb sometime returns multiple results, + # and the first elememt maybe not the one we are looking for + # iterate all candidates and find the match one + urls = html.xpath('//*[@id="videos"]/div/div/a/@href') + ids =html.xpath('//*[@id="videos"]/div/div/a/div[contains(@class, "uid")]/text()') + correct_url = urls[ids.index(number)] + detail_page = get_html('https://javdb.com' + correct_url) + + # If gray image exists ,then replace with normal cover + cover_small = getCover_small(query_result, index=ids.index(number)) + if 'placeholder' in cover_small: + cover_small = getCover(detail_page) + + + dic = { + 'actor': getActor(detail_page), + 'title': getTitle(detail_page), + 'studio': getStudio(detail_page), + 'outline': getOutline(detail_page), + 'runtime': getRuntime(detail_page), + 'director': 
getDirector(detail_page), + 'release': getRelease(detail_page), + 'number': getNum(detail_page), + 'cover': getCover(detail_page), + 'cover_small': cover_small, + 'imagecut': 3, + 'tag': getTag(detail_page), + 'label': getLabel(detail_page), + 'year': getYear(getRelease(detail_page)), # str(re.search('\d{4}',getRelease(a)).group()), + 'actor_photo': getActorPhoto(getActor(detail_page)), + 'website': 'https://javdb.com' + correct_url, + 'source': 'javdb.py', + 'series': getSeries(detail_page), + } + except Exception as e: + # print(e) + dic = {"title": ""} + js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') + return js + +# main('DV-1562') +# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。") +if __name__ == "__main__": + print(main('snyz-007')) diff --git a/WebCrawler/javlib.py b/WebCrawler/javlib.py new file mode 100644 index 0000000..cb6f78b --- /dev/null +++ b/WebCrawler/javlib.py @@ -0,0 +1,110 @@ +import json +import bs4 +from bs4 import BeautifulSoup +from lxml import html +from http.cookies import SimpleCookie + +from ADC_function import get_javlib_cookie, get_html + + +def main(number: str): + raw_cookies, user_agent = get_javlib_cookie() + + # Blank cookies mean javlib site return error + if not raw_cookies: + return json.dumps({}, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':')) + + # Manually construct a dictionary + s_cookie = SimpleCookie() + s_cookie.load(raw_cookies) + cookies = {} + for key, morsel in s_cookie.items(): + cookies[key] = morsel.value + + # Scraping + result = get_html( + "http://www.javlibrary.com/cn/vl_searchbyid.php?keyword={}".format(number), + cookies=cookies, + ua=user_agent, + return_type="object" + ) + soup = BeautifulSoup(result.text, "html.parser") + lx = html.fromstring(str(soup)) + + if "/?v=jav" in result.url: + dic = { + "title": get_title(lx, soup), + "studio": get_table_el_single_anchor(soup, "video_maker"), + "year": get_table_el_td(soup, "video_date")[:4], + "outline": "", + "director": get_table_el_single_anchor(soup, "video_director"), + "cover": get_cover(lx), + "imagecut": 1, + "actor_photo": "", + "website": result.url, + "source": "javlib.py", + "actor": get_table_el_multi_anchor(soup, "video_cast"), + "label": get_table_el_td(soup, "video_label"), + "tag": get_table_el_multi_anchor(soup, "video_genres"), + "number": get_table_el_td(soup, "video_id"), + "release": get_table_el_td(soup, "video_date"), + "runtime": get_from_xpath(lx, '//*[@id="video_length"]/table/tr/td[2]/span/text()'), + "series":'', + } + else: + dic = {} + + return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':')) + + +def get_from_xpath(lx: html.HtmlElement, xpath: str) -> str: + return lx.xpath(xpath)[0].strip() + + +def get_table_el_single_anchor(soup: BeautifulSoup, tag_id: str) -> str: + tag = soup.find(id=tag_id).find("a") + + if tag is not None: + return tag.string.strip() + else: + return "" + + +def get_table_el_multi_anchor(soup: BeautifulSoup, tag_id: str) -> str: + tags = soup.find(id=tag_id).find_all("a") + + return process(tags) + + +def get_table_el_td(soup: BeautifulSoup, tag_id: str) -> str: + tags = soup.find(id=tag_id).find_all("td", class_="text") + + return process(tags) + + +def process(tags: bs4.element.ResultSet) -> str: + values = [] + for tag in tags: + value = tag.string + if value is not None and value != "----": + values.append(value) + + return ",".join(x for x 
in values if x) + + +def get_title(lx: html.HtmlElement, soup: BeautifulSoup) -> str: + title = get_from_xpath(lx, '//*[@id="video_title"]/h3/a/text()') + number = get_table_el_td(soup, "video_id") + + return title.replace(number, "").strip() + + +def get_cover(lx: html.HtmlComment) -> str: + return "http:{}".format(get_from_xpath(lx, '//*[@id="video_jacket_img"]/@src')) + + +if __name__ == "__main__": + lists = ["DVMC-003", "GS-0167", "JKREZ-001", "KMHRS-010", "KNSD-023"] + #lists = ["DVMC-003"] + for num in lists: + print(main(num)) diff --git a/WebCrawler/mgstage.py b/WebCrawler/mgstage.py new file mode 100644 index 0000000..2c6391b --- /dev/null +++ b/WebCrawler/mgstage.py @@ -0,0 +1,120 @@ +import re +from lxml import etree +import json +from bs4 import BeautifulSoup +from ADC_function import * +# import sys +# import io +# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) + +def getTitle(a): + try: + html = etree.fromstring(a, etree.HTMLParser()) + result = str(html.xpath('//*[@id="center_column"]/div[1]/h1/text()')).strip(" ['']") + return result.replace('/', ',') + except: + return '' +def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text() + html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text() + result1=str(html.xpath('//th[contains(text(),"出演:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n') + result2=str(html.xpath('//th[contains(text(),"出演:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n') + return str(result1+result2).strip('+').replace("', '",'').replace('"','').replace('/',',') +def getStudio(a): + html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text() + result1=str(html.xpath('//th[contains(text(),"メーカー:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n') + result2=str(html.xpath('//th[contains(text(),"メーカー:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n') + return str(result1+result2).strip('+').replace("', '",'').replace('"','') +def getRuntime(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n') + result2 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n') + return str(result1 + result2).strip('+').rstrip('mi') +def getLabel(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip( + '\\n') + result2 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/text()')).strip(" ['']").strip('\\n ').strip( + '\\n') + return str(result1 + result2).strip('+').replace("', '",'').replace('"','') +def getNum(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//th[contains(text(),"品番:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip( + '\\n') + result2 = str(html.xpath('//th[contains(text(),"品番:")]/../td/text()')).strip(" ['']").strip('\\n ').strip( + '\\n') + return str(result1 + result2).strip('+') +def getYear(getRelease): + try: + result = str(re.search('\d{4}',getRelease).group()) + return result + except: + return getRelease +def getRelease(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/a/text()')).strip(" 
['']").strip('\\n ').strip( + '\\n') + result2 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/text()')).strip(" ['']").strip('\\n ').strip( + '\\n') + return str(result1 + result2).strip('+').replace('/','-') +def getTag(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip( + '\\n') + result2 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/text()')).strip(" ['']").strip('\\n ').strip( + '\\n') + return str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','').replace(',,','').split(',') +def getCover(htmlcode): + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = str(html.xpath('//*[@id="center_column"]/div[1]/div[1]/div/div/h2/img/@src')).strip(" ['']") + # /html/body/div[2]/article[2]/div[1]/div[1]/div/div/h2/img/@src + return result +def getDirector(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip( + '\\n') + result2 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/text()')).strip(" ['']").strip('\\n ').strip( + '\\n') + return str(result1 + result2).strip('+').replace("', '",'').replace('"','') +def getOutline(htmlcode): + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = str(html.xpath('//p/text()')).strip(" ['']").replace(u'\\n', '').replace("', '', '", '') + return result +def getSeries(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip( + '\\n') + result2 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/text()')).strip(" ['']").strip('\\n ').strip( + '\\n') + return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') +def main(number2): + number=number2.upper() + htmlcode=str(get_html('https://www.mgstage.com/product/product_detail/'+str(number)+'/',cookies={'adc':'1'})) + soup = BeautifulSoup(htmlcode, 'lxml') + a = str(soup.find(attrs={'class': 'detail_data'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','') + b = str(soup.find(attrs={'id': 'introduction'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','') + #print(b) + dic = { + 'title': getTitle(htmlcode).replace("\\n",'').replace(' ',''), + 'studio': getStudio(a), + 'outline': getOutline(b), + 'runtime': getRuntime(a), + 'director': getDirector(a), + 'actor': getActor(a), + 'release': getRelease(a), + 'number': getNum(a), + 'cover': getCover(htmlcode), + 'imagecut': 0, + 'tag': getTag(a), + 'label':getLabel(a), + 'year': getYear(getRelease(a)), # str(re.search('\d{4}',getRelease(a)).group()), + 'actor_photo': '', + 'website':'https://www.mgstage.com/product/product_detail/'+str(number)+'/', + 'source': 'mgstage.py', + 'series': getSeries(a), + } + js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') + return js + #print(htmlcode) + +if __name__ == '__main__': + print(main('SIRO-4149')) diff --git a/WebCrawler/xcity.py b/WebCrawler/xcity.py new file mode 100644 index 0000000..fda4f2c --- /dev/null +++ b/WebCrawler/xcity.py @@ -0,0 +1,192 @@ +import re +from lxml import etree +import json +from bs4 import BeautifulSoup +from ADC_function import * + + +# import sys +# import io +# sys.stdout = 
io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) + +def getTitle(a): + html = etree.fromstring(a, etree.HTMLParser()) + result = html.xpath('//*[@id="program_detail_title"]/text()')[0] + return result + + +def getActor(a): # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text() + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[3]/a/text()')[0] + return result1 + + +def getActorPhoto(actor): # //*[@id="star_qdt"]/li/a/img + a = actor.split(',') + d = {} + for i in a: + p = {i: ''} + d.update(p) + return d + + +def getStudio(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + try: + result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[4]/a/span/text()')).strip(" ['']") + except: + result = str(html.xpath('//strong[contains(text(),"片商")]/../following-sibling::span/a/text()')).strip(" ['']") + return result.strip('+').replace("', '", '').replace('"', '') + + +def getRuntime(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + try: + result1 = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[3]/text()')[0] + except: + return '' + try: + return re.findall('\d+',result1)[0] + except: + return '' + + +def getLabel(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + try: + result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[5]/a/span/text()')[0] + return result + except: + return '' + + +def getNum(a): + html = etree.fromstring(a, etree.HTMLParser()) + try: + result = html.xpath('//*[@id="hinban"]/text()')[0] + return result + except: + return '' + + +def getYear(getRelease): + try: + result = str(re.search('\d{4}', getRelease).group()) + return result + except: + return getRelease + + +def getRelease(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + try: + result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[4]/text()')[0] + except: + return '' + try: + return re.findall('\d{4}/\d{2}/\d{2}', result)[0].replace('/','-') + except: + return '' + + +def getTag(a): + result2=[] + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[6]/a/text()') + for i in result1: + i=i.replace(u'\n','') + i=i.replace(u'\t','') + result2.append(i) + return result2 + + +def getCover_small(a, index=0): + # same issue mentioned below, + # javdb sometime returns multiple results + # DO NOT just get the firt one, get the one with correct index number + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index] + if not 'https' in result: + result = 'https:' + result + return result + + +def getCover(htmlcode): + html = etree.fromstring(htmlcode, etree.HTMLParser()) + try: + result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[1]/p/a/@href')[0] + return 'https:' + result + except: + return '' + + +def getDirector(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + try: + result = html.xpath('//*[@id="program_detail_director"]/text()')[0].replace(u'\n','').replace(u'\t', '') + return result + except: + return '' + + +def getOutline(htmlcode): + html = etree.fromstring(htmlcode, etree.HTMLParser()) + try: + result = 
html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[5]/p/text()')[0]
+    except:
+        return ''
+    try:
+        return re.sub('\\\\\w*\d+','',result)
+    except:
+        return result
+
+def getSeries(htmlcode):
+    html = etree.fromstring(htmlcode, etree.HTMLParser())
+    try:
+        try:
+            result = html.xpath("//span[contains(text(),'シリーズ')]/../a/span/text()")[0]
+            return result
+        except:
+            result = html.xpath("//span[contains(text(),'シリーズ')]/../span/text()")[0]
+            return result
+    except:
+        return ''
+
+
+def main(number):
+    try:
+        number = number.upper()
+        query_result = get_html(
+            'https://xcity.jp/result_published/?genre=%2Fresult_published%2F&q=' + number.replace('-','') + '&sg=main&num=30')
+        html = etree.fromstring(query_result, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
+        urls = html.xpath("//table[contains(@class, 'resultList')]/tr[2]/td[1]/a/@href")[0]
+        detail_page = get_html('https://xcity.jp' + urls)
+        dic = {
+            'actor': getActor(detail_page),
+            'title': getTitle(detail_page),
+            'studio': getStudio(detail_page),
+            'outline': getOutline(detail_page),
+            'runtime': getRuntime(detail_page),
+            'director': getDirector(detail_page),
+            'release': getRelease(detail_page),
+            'number': getNum(detail_page),
+            'cover': getCover(detail_page),
+            'cover_small': '',
+            'imagecut': 1,
+            'tag': getTag(detail_page),
+            'label': getLabel(detail_page),
+            'year': getYear(getRelease(detail_page)),  # str(re.search('\d{4}',getRelease(a)).group()),
+            'actor_photo': getActorPhoto(getActor(detail_page)),
+            'website': 'https://xcity.jp' + urls,
+            'source': 'xcity.py',
+            'series': getSeries(detail_page),
+        }
+    except Exception as e:
+        # print(e)
+        dic = {"title": ""}
+
+    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
+    return js
+
+if __name__ == '__main__':
+    print(main('VNDS-2624'))
diff --git a/core.py b/core.py
index 7eb926d..ca1019c 100755
--- a/core.py
+++ b/core.py
@@ -17,6 +17,7 @@ import javdb
 import mgstage
 import xcity
 import javlib
+import dlsite
 
 def escape_path(path, escape_literals: str):  # Remove escape literals
@@ -56,6 +57,7 @@ def get_data_from_json(file_number, filepath, conf: config.Config): # 从JSON
         "jav321": jav321.main,
         "xcity": xcity.main,
         "javlib": javlib.main,
+        "dlsite": dlsite.main,
     }
 
     # default fetch order list, from the beginning to the end
@@ -74,6 +76,9 @@
     elif "fc2" in file_number or "FC2" in file_number:
         sources.insert(0, sources.pop(sources.index("fc2")))
 
+    elif "RJ" in file_number.upper() or "VJ" in file_number.upper():
+        sources.insert(0, sources.pop(sources.index("dlsite")))
+
     json_data = {}
     for source in sources:
         json_data = json.loads(func_mapping[source](file_number))
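
The dlsite routing added to core.py keys off the RJ/VJ prefixes
case-insensitively. Note that a chained form like `"RJ" in file_number or
"rj" or "VJ" ...` would always be truthy, since bare non-empty strings are
true in Python; each prefix has to be tested against file_number itself. A
quick check of the predicate used above:

    def routes_to_dlsite(file_number: str) -> bool:
        return "RJ" in file_number.upper() or "VJ" in file_number.upper()

    assert routes_to_dlsite("RJ013152") and routes_to_dlsite("vj013479")
    assert not routes_to_dlsite("IPX-292")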
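Every crawler above imports get_html (and javlib additionally
get_javlib_cookie) from ADC_function, which this patch does not touch. A
minimal sketch of the interface the call sites assume; the requests usage,
default User-Agent, and timeout below are assumptions, not the project's
actual implementation:

    import requests

    def get_html(url, cookies=None, ua=None, return_type=None):
        # Hedged sketch only; the real ADC_function.get_html may differ.
        headers = {"User-Agent": ua or "Mozilla/5.0"}
        resp = requests.get(url, headers=headers, cookies=cookies, timeout=10)
        resp.encoding = resp.apparent_encoding  # target sites mix UTF-8/EUC-JP/Shift_JIS
        if return_type == "object":
            return resp  # javlib.py reads .text and .url off the response object
        return resp.text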
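Every main() above returns the same JSON document that core.py decodes with
json.loads; a usage sketch, assuming the WebCrawler modules are importable
bare, as core.py's own `import dlsite` implies:

    import json
    import avsox  # assumed to be on sys.path, like core.py's bare imports

    data = json.loads(avsox.main("012717_472"))
    print(data["number"], data["title"], data["release"], data["cover"])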
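A recurring idiom in these modules is str(list) followed by .strip(" ['']"),
which serializes the whole XPath result list and then trims bracket and quote
characters from the ends; on multi-element results the list separator leaks
into the value. Joining the list is the safer equivalent:

    from lxml import etree

    html = etree.fromstring("<p><span>A</span><span>B</span></p>", etree.HTMLParser())
    texts = html.xpath("//span/text()")           # ["A", "B"]
    flat = str(texts).strip(" ['']")              # "A', 'B"  (separator leaks through)
    joined = ", ".join(t.strip() for t in texts)  # "A, B"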
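The getYear helpers repeated in most modules all reduce to the same rule:
take the first four-digit run from the release string, and fall back to the
raw input when there is none. Extracted once:

    import re

    def get_year(release: str) -> str:
        match = re.search(r"\d{4}", release)
        return match.group() if match else release  # modules return the input on no match

    assert get_year("2020-08-12") == "2020"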
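avsox.main() retries its search whenever the waterfall result list comes back
empty, rewriting the number between attempts; the variant chain it walks is:

    def search_variants(number: str):
        yield number                    # e.g. "012717-472"
        yield number.replace("-", "_")  # "012717_472"
        yield number.replace("_", "")   # underscores stripped from the original input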
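jav321's parse_info() depends on the metadata block being <br/>-separated:
each fragment is keyed by its bold label via get_bold_text() and the plain
value is read with get_text_info(). On a single fragment:

    from bs4 import BeautifulSoup

    fragment = "<b>番号</b>: soe-259"  # one <br/>-delimited piece of div.col-md-9
    label = BeautifulSoup(fragment, "html.parser").b.text  # "番号"
    value = fragment.split(": ")[1]                        # "soe-259"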
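fanza.main() never fetches a product page directly; each candidate URL is
wrapped in DMM's age gate, with the real detail URL carried in the rurl query
parameter:

    from urllib.parse import urlencode

    cid_url = "https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=ssis00123"  # example cid, assumed
    gate_url = "https://www.dmm.co.jp/age_check/=/declared=yes/?" + urlencode({"rurl": cid_url})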
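javdb search pages can return several candidates, so javdb.main() pairs the
result hrefs with the uid labels positionally and selects the exact match;
list.index raises ValueError on a miss, which the enclosing except converts
into the empty {"title": ""} payload:

    urls = ["/v/Kn3vz", "/v/8XAqB"]  # hypothetical hrefs from //*[@id="videos"]
    ids = ["SNYZ-006", "SNYZ-007"]   # uid strings scraped alongside them
    correct_url = urls[ids.index("SNYZ-007")]  # "/v/8XAqB"; ValueError if absent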
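javlib.py rebuilds its cookie dict from the raw header string that
get_javlib_cookie() returns; the stdlib SimpleCookie round-trip it uses works
standalone:

    from http.cookies import SimpleCookie

    s_cookie = SimpleCookie()
    s_cookie.load("cf_clearance=abc123; __cfduid=xyz")  # hypothetical raw cookie header
    cookies = {key: morsel.value for key, morsel in s_cookie.items()}
    assert cookies["cf_clearance"] == "abc123"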