From f6da5db276c46e5185518326973070e01c82b467 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 12 Aug 2020 18:27:58 +0800 Subject: [PATCH] Update 3.7-2 --- avsox.py | 124 -------------------- fanza.py | 297 ------------------------------------------------ fc2fans_club.py | 165 --------------------------- jav321.py | 156 ------------------------- javbus.py | 167 --------------------------- javdb.py | 154 ------------------------- javlib.py | 110 ------------------ mgstage.py | 120 ------------------- xcity.py | 192 ------------------------------- 9 files changed, 1485 deletions(-) delete mode 100644 avsox.py delete mode 100644 fanza.py delete mode 100755 fc2fans_club.py delete mode 100644 jav321.py delete mode 100755 javbus.py delete mode 100755 javdb.py delete mode 100644 javlib.py delete mode 100755 mgstage.py delete mode 100644 xcity.py diff --git a/avsox.py b/avsox.py deleted file mode 100644 index c3d0b6a..0000000 --- a/avsox.py +++ /dev/null @@ -1,124 +0,0 @@ -import re -from lxml import etree -import json -from bs4 import BeautifulSoup -from ADC_function import * -# import sys -# import io -# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) - -def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img - soup = BeautifulSoup(htmlcode, 'lxml') - a = soup.find_all(attrs={'class': 'avatar-box'}) - d = {} - for i in a: - l = i.img['src'] - t = i.span.get_text() - p2 = {t: l} - d.update(p2) - return d -def getTitle(a): - try: - html = etree.fromstring(a, etree.HTMLParser()) - result = str(html.xpath('/html/body/div[2]/h3/text()')).strip(" ['']") #[0] - return result.replace('/', '') - except: - return '' -def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text() - soup = BeautifulSoup(a, 'lxml') - a = soup.find_all(attrs={'class': 'avatar-box'}) - d = [] - for i in a: - d.append(i.span.get_text()) - return d -def getStudio(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()')).strip(" ['']").replace("', '",' ') - return result1 -def getRuntime(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//span[contains(text(),"长度:")]/../text()')).strip(" ['分钟']") - return result1 -def getLabel(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()')).strip(" ['']") - return result1 -def getNum(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//span[contains(text(),"识别码:")]/../span[2]/text()')).strip(" ['']") - return result1 -def getYear(release): - try: - result = str(re.search('\d{4}',release).group()) - return result - except: - return release -def getRelease(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//span[contains(text(),"发行时间:")]/../text()')).strip(" ['']") - return result1 -def getCover(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('/html/body/div[2]/div[1]/div[1]/a/img/@src')).strip(" ['']") - return result -def getCover_small(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')).strip(" ['']") - return result -def getTag(a): # 获取演员 - soup = BeautifulSoup(a, 'lxml') - a = soup.find_all(attrs={'class': 'genre'}) - d = [] - for i in a: - d.append(i.get_text()) - return d -def getSeries(htmlcode): - try: - html = etree.fromstring(htmlcode, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//span[contains(text(),"系列:")]/../span[2]/text()')).strip(" ['']") - return result1 - except: - return '' - -def main(number): - a = get_html('https://avsox.host/cn/search/' + number) - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']") - if result1 == '' or result1 == 'null' or result1 == 'None': - a = get_html('https://avsox.host/cn/search/' + number.replace('-', '_')) - print(a) - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']") - if result1 == '' or result1 == 'null' or result1 == 'None': - a = get_html('https://avsox.host/cn/search/' + number.replace('_', '')) - print(a) - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']") - web = get_html(result1) - soup = BeautifulSoup(web, 'lxml') - info = str(soup.find(attrs={'class': 'row movie'})) - dic = { - 'actor': getActor(web), - 'title': getTitle(web).strip(getNum(web)), - 'studio': getStudio(info), - 'outline': '',# - 'runtime': getRuntime(info), - 'director': '', # - 'release': getRelease(info), - 'number': getNum(info), - 'cover': getCover(web), - 'cover_small': getCover_small(a), - 'imagecut': 3, - 'tag': getTag(web), - 'label': getLabel(info), - 'year': getYear(getRelease(info)), # str(re.search('\d{4}',getRelease(a)).group()), - 'actor_photo': getActorPhoto(web), - 'website': result1, - 'source': 'avsox.py', - 'series': getSeries(info), - } - js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') - return js - -if __name__ == "__main__": - print(main('012717_472')) \ No newline at end of file diff --git a/fanza.py b/fanza.py deleted file mode 100644 index 71aab6a..0000000 --- a/fanza.py +++ /dev/null @@ -1,297 +0,0 @@ -#!/usr/bin/python3 -# -*- coding: utf-8 -*- -import json -import re -from urllib.parse import urlencode - -from lxml import etree - -from ADC_function import * - -# import sys -# import io -# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) - - -def getTitle(text): - html = etree.fromstring(text, etree.HTMLParser()) - result = html.xpath('//*[starts-with(@id, "title")]/text()')[0] - return result - - -def getActor(text): - # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text() - html = etree.fromstring(text, etree.HTMLParser()) - result = ( - str( - html.xpath( - "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()" - ) - ) - .strip(" ['']") - .replace("', '", ",") - ) - return result - - -def getStudio(text): - html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - try: - result = html.xpath( - "//td[contains(text(),'メーカー')]/following-sibling::td/a/text()" - )[0] - except: - result = html.xpath( - "//td[contains(text(),'メーカー')]/following-sibling::td/text()" - )[0] - return result - - -def getRuntime(text): - html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result = html.xpath("//td[contains(text(),'収録時間')]/following-sibling::td/text()")[0] - return re.search(r"\d+", str(result)).group() - - -def getLabel(text): - html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - try: - result = html.xpath( - "//td[contains(text(),'レーベル:')]/following-sibling::td/a/text()" - )[0] - except: - result = html.xpath( - "//td[contains(text(),'レーベル:')]/following-sibling::td/text()" - )[0] - return result - - -def getNum(text): - html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - try: - result = html.xpath( - "//td[contains(text(),'品番:')]/following-sibling::td/a/text()" - )[0] - except: - result = html.xpath( - "//td[contains(text(),'品番:')]/following-sibling::td/text()" - )[0] - return result - - -def getYear(getRelease): - try: - result = str(re.search(r"\d{4}", getRelease).group()) - return result - except: - return getRelease - - -def getRelease(text): - html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - try: - result = html.xpath( - "//td[contains(text(),'発売日:')]/following-sibling::td/a/text()" - )[0].lstrip("\n") - except: - try: - result = html.xpath( - "//td[contains(text(),'発売日:')]/following-sibling::td/text()" - )[0].lstrip("\n") - except: - result = "----" - if result == "----": - try: - result = html.xpath( - "//td[contains(text(),'配信開始日:')]/following-sibling::td/a/text()" - )[0].lstrip("\n") - except: - try: - result = html.xpath( - "//td[contains(text(),'配信開始日:')]/following-sibling::td/text()" - )[0].lstrip("\n") - except: - pass - return result.replace("/", "-") - - -def getTag(text): - html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - try: - result = html.xpath( - "//td[contains(text(),'ジャンル:')]/following-sibling::td/a/text()" - ) - except: - result = html.xpath( - "//td[contains(text(),'ジャンル:')]/following-sibling::td/text()" - ) - return result - - -def getCover(text, number): - html = etree.fromstring(text, etree.HTMLParser()) - cover_number = number - try: - result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0] - except: - # sometimes fanza modify _ to \u0005f for image id - if "_" in cover_number: - cover_number = cover_number.replace("_", r"\u005f") - try: - result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0] - except: - # (TODO) handle more edge case - # print(html) - # raise exception here, same behavior as before - # people's major requirement is fetching the picture - raise ValueError("can not find image") - return result - - -def getDirector(text): - html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - try: - result = html.xpath( - "//td[contains(text(),'監督:')]/following-sibling::td/a/text()" - )[0] - except: - result = html.xpath( - "//td[contains(text(),'監督:')]/following-sibling::td/text()" - )[0] - return result - - -def getOutline(text): - html = etree.fromstring(text, etree.HTMLParser()) - try: - result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")[0]).replace( - "\n", "" - ) - if result == "": - result = str(html.xpath("//div[@class='mg-b20 lh4']//p/text()")[0]).replace( - "\n", "" - ) - except: - # (TODO) handle more edge case - # print(html) - return "" - return result - - -def getSeries(text): - try: - html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - try: - result = html.xpath( - "//td[contains(text(),'シリーズ:')]/following-sibling::td/a/text()" - )[0] - except: - result = html.xpath( - "//td[contains(text(),'シリーズ:')]/following-sibling::td/text()" - )[0] - return result - except: - return "" - - -def main(number): - # fanza allow letter + number + underscore, normalize the input here - # @note: I only find the usage of underscore as h_test123456789 - fanza_search_number = number - # AV_Data_Capture.py.getNumber() over format the input, restore the h_ prefix - if fanza_search_number.startswith("h-"): - fanza_search_number = fanza_search_number.replace("h-", "h_") - - fanza_search_number = re.sub(r"[^0-9a-zA-Z_]", "", fanza_search_number).lower() - - fanza_urls = [ - "https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=", - "https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=", - "https://www.dmm.co.jp/digital/anime/-/detail/=/cid=", - "https://www.dmm.co.jp/mono/anime/-/detail/=/cid=", - "https://www.dmm.co.jp/digital/videoc/-/detail/=/cid=", - "https://www.dmm.co.jp/digital/nikkatsu/-/detail/=/cid=", - "https://www.dmm.co.jp/rental/-/detail/=/cid=", - ] - chosen_url = "" - - for url in fanza_urls: - chosen_url = url + fanza_search_number - htmlcode = get_html( - "https://www.dmm.co.jp/age_check/=/declared=yes/?{}".format( - urlencode({"rurl": chosen_url}) - ) - ) - if "404 Not Found" not in htmlcode: - break - if "404 Not Found" in htmlcode: - return json.dumps({"title": "",}) - try: - # for some old page, the input number does not match the page - # for example, the url will be cid=test012 - # but the hinban on the page is test00012 - # so get the hinban first, and then pass it to following functions - fanza_hinban = getNum(htmlcode) - data = { - "title": getTitle(htmlcode).strip(), - "studio": getStudio(htmlcode), - "outline": getOutline(htmlcode), - "runtime": getRuntime(htmlcode), - "director": getDirector(htmlcode) if "anime" not in chosen_url else "", - "actor": getActor(htmlcode) if "anime" not in chosen_url else "", - "release": getRelease(htmlcode), - "number": fanza_hinban, - "cover": getCover(htmlcode, fanza_hinban), - "imagecut": 1, - "tag": getTag(htmlcode), - "label": getLabel(htmlcode), - "year": getYear( - getRelease(htmlcode) - ), # str(re.search('\d{4}',getRelease(a)).group()), - "actor_photo": "", - "website": chosen_url, - "source": "fanza.py", - "series": getSeries(htmlcode), - } - except: - data = { - "title": "", - } - js = json.dumps( - data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":") - ) # .encode('UTF-8') - return js - - -def main_htmlcode(number): - # fanza allow letter + number + underscore, normalize the input here - # @note: I only find the usage of underscore as h_test123456789 - fanza_search_number = number - # AV_Data_Capture.py.getNumber() over format the input, restore the h_ prefix - if fanza_search_number.startswith("h-"): - fanza_search_number = fanza_search_number.replace("h-", "h_") - - fanza_search_number = re.sub(r"[^0-9a-zA-Z_]", "", fanza_search_number).lower() - - fanza_urls = [ - "https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=", - "https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=", - "https://www.dmm.co.jp/digital/anime/-/detail/=/cid=", - "https://www.dmm.co.jp/mono/anime/-/detail/=/cid=", - "https://www.dmm.co.jp/digital/videoc/-/detail/=/cid=", - "https://www.dmm.co.jp/digital/nikkatsu/-/detail/=/cid=", - ] - chosen_url = "" - for url in fanza_urls: - chosen_url = url + fanza_search_number - htmlcode = get_html(chosen_url) - if "404 Not Found" not in htmlcode: - break - if "404 Not Found" in htmlcode: - return json.dumps({"title": "",}) - return htmlcode - - -if __name__ == "__main__": - print(main("DV-1562")) - print(main("96fad1217")) diff --git a/fc2fans_club.py b/fc2fans_club.py deleted file mode 100755 index 2c31a51..0000000 --- a/fc2fans_club.py +++ /dev/null @@ -1,165 +0,0 @@ -import re -from lxml import etree#need install -import json -import ADC_function -# import sys -# import io -# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) - -def getTitle(htmlcode): #获取厂商 - #print(htmlcode) - html = etree.fromstring(htmlcode,etree.HTMLParser()) - result = str(html.xpath('/html/body/div[2]/div/div[1]/h3/text()')).strip(" ['']") - result2 = str(re.sub('\D{2}2-\d+','',result)).replace(' ','',1) - #print(result2) - return result2 -def getActor(htmlcode): - try: - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('/html/body/div[2]/div/div[1]/h5[5]/a/text()')).strip(" ['']") - return result - except: - return '' -def getStudio(htmlcode): #获取厂商 - html = etree.fromstring(htmlcode,etree.HTMLParser()) - result = str(html.xpath('/html/body/div[2]/div/div[1]/h5[3]/a[1]/text()')).strip(" ['']") - return result -def getNum(htmlcode): #获取番号 - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']") - #print(result) - return result -def getRelease(htmlcode2): # - #a=ADC_function.get_html('http://adult.contents.fc2.com/article_search.php?id='+str(number).lstrip("FC2-").lstrip("fc2-").lstrip("fc2_").lstrip("fc2-")+'&utm_source=aff_php&utm_medium=source_code&utm_campaign=from_aff_php') - html=etree.fromstring(htmlcode2,etree.HTMLParser()) - result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[2]/dl/dd[4]/text()')).strip(" ['']") - return result -def getCover(htmlcode,number,htmlcode2): #获取厂商 # - #a = ADC_function.get_html('http://adult.contents.fc2.com/article_search.php?id=' + str(number).lstrip("FC2-").lstrip("fc2-").lstrip("fc2_").lstrip("fc2-") + '&utm_source=aff_php&utm_medium=source_code&utm_campaign=from_aff_php') - html = etree.fromstring(htmlcode2, etree.HTMLParser()) - result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[1]/a/img/@src')).strip(" ['']") - if result == '': - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result2 = str(html.xpath('//*[@id="slider"]/ul[1]/li[1]/img/@src')).strip(" ['']") - return 'https://fc2club.com' + result2 - return 'http:' + result -def getOutline(htmlcode2): #获取番号 # - html = etree.fromstring(htmlcode2, etree.HTMLParser()) - result = str(html.xpath('/html/body/div[1]/div[2]/div[2]/div[1]/div/article/section[4]/p/text()')).strip(" ['']").replace("\\n",'',10000).replace("'",'',10000).replace(', ,','').strip(' ').replace('。,',',') - return result -def getTag(htmlcode): #获取番号 - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('/html/body/div[2]/div/div[1]/h5[4]/a/text()')) - return result.strip(" ['']").replace("'",'').replace(' ','') -def getYear(release): - try: - result = re.search('\d{4}',release).group() - return result - except: - return '' - -def getTitle_fc2com(htmlcode): #获取厂商 - html = etree.fromstring(htmlcode,etree.HTMLParser()) - result = html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/h3/text()')[0] - return result -def getActor_fc2com(htmlcode): - try: - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')[0] - return result - except: - return '' -def getStudio_fc2com(htmlcode): #获取厂商 - try: - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')).strip(" ['']") - return result - except: - return '' -def getNum_fc2com(htmlcode): #获取番号 - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']") - return result -def getRelease_fc2com(htmlcode2): # - html=etree.fromstring(htmlcode2,etree.HTMLParser()) - result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[2]/dl/dd[4]/text()')).strip(" ['']") - return result -def getCover_fc2com(htmlcode2): #获取厂商 # - html = etree.fromstring(htmlcode2, etree.HTMLParser()) - result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[1]/span/img/@src')).strip(" ['']") - return 'http:' + result -def getOutline_fc2com(htmlcode2): #获取番号 # - html = etree.fromstring(htmlcode2, etree.HTMLParser()) - result = str(html.xpath('/html/body/div/text()')).strip(" ['']").replace("\\n",'',10000).replace("'",'',10000).replace(', ,','').strip(' ').replace('。,',',') - return result -def getTag_fc2com(number): #获取番号 - htmlcode = str(bytes(ADC_function.get_html('http://adult.contents.fc2.com/api/v4/article/'+number+'/tag?'),'utf-8').decode('unicode-escape')) - result = re.findall('"tag":"(.*?)"', htmlcode) - return result -def getYear_fc2com(release): - try: - result = re.search('\d{4}',release).group() - return result - except: - return '' - -def main(number): - try: - number = number.replace('FC2-', '').replace('fc2-', '') - htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/'+number+'/') - htmlcode = ADC_function.get_html('https://fc2club.com//html/FC2-' + number + '.html') - actor = getActor(htmlcode) - if getActor(htmlcode) == '': - actor = 'FC2系列' - dic = { - 'title': getTitle(htmlcode), - 'studio': getStudio(htmlcode), - 'year': '',#str(re.search('\d{4}',getRelease(number)).group()), - 'outline': '',#getOutline(htmlcode2), - 'runtime': getYear(getRelease(htmlcode)), - 'director': getStudio(htmlcode), - 'actor': actor, - 'release': getRelease(number), - 'number': 'FC2-'+number, - 'label': '', - 'cover': getCover(htmlcode,number,htmlcode2), - 'imagecut': 0, - 'tag': getTag(htmlcode), - 'actor_photo':'', - 'website': 'https://fc2club.com//html/FC2-' + number + '.html', - 'source':'https://fc2club.com//html/FC2-' + number + '.html', - 'series': '', - } - if dic['title'] == '': - htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/',cookies={'wei6H':'1'}) - actor = getActor(htmlcode) - if getActor(htmlcode) == '': - actor = 'FC2系列' - dic = { - 'title': getTitle_fc2com(htmlcode2), - 'studio': getStudio_fc2com(htmlcode2), - 'year': '', # str(re.search('\d{4}',getRelease(number)).group()), - 'outline': getOutline_fc2com(htmlcode2), - 'runtime': getYear_fc2com(getRelease(htmlcode2)), - 'director': getStudio_fc2com(htmlcode2), - 'actor': actor, - 'release': getRelease_fc2com(number), - 'number': 'FC2-' + number, - 'cover': getCover_fc2com(htmlcode2), - 'imagecut': 0, - 'tag': getTag_fc2com(number), - 'label': '', - 'actor_photo': '', - 'website': 'http://adult.contents.fc2.com/article/' + number + '/', - 'source': 'http://adult.contents.fc2.com/article/' + number + '/', - 'series': '', - } - except Exception as e: - # (TODO) better handle this - # print(e) - dic = {"title": ""} - js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'),)#.encode('UTF-8') - return js - -if __name__ == '__main__': - print(main('1252953')) \ No newline at end of file diff --git a/jav321.py b/jav321.py deleted file mode 100644 index 7b0baae..0000000 --- a/jav321.py +++ /dev/null @@ -1,156 +0,0 @@ -import json -from bs4 import BeautifulSoup -from lxml import html -from ADC_function import post_html - - -def main(number: str) -> json: - result = post_html(url="https://www.jav321.com/search", query={"sn": number}) - soup = BeautifulSoup(result.text, "html.parser") - lx = html.fromstring(str(soup)) - - if "/video/" in result.url: - data = parse_info(soup) - dic = { - "title": get_title(lx), - "year": get_year(data), - "outline": get_outline(lx), - "director": "", - "cover": get_cover(lx), - "imagecut": 1, - "actor_photo": "", - "website": result.url, - "source": "jav321.py", - **data, - } - else: - dic = {} - - return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':')) - - -def get_title(lx: html.HtmlElement) -> str: - return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[1]/h3/text()")[0].strip() - - -def parse_info(soup: BeautifulSoup) -> dict: - data = soup.select_one("div.row > div.col-md-9") - - if data: - dd = str(data).split("
") - data_dic = {} - for d in dd: - data_dic[get_bold_text(h=d)] = d - - return { - "actor": get_actor(data_dic), - "label": get_label(data_dic), - "studio": get_studio(data_dic), - "tag": get_tag(data_dic), - "number": get_number(data_dic), - "release": get_release(data_dic), - "runtime": get_runtime(data_dic), - "series": get_series(data_dic), - } - else: - return {} - - -def get_bold_text(h: str) -> str: - soup = BeautifulSoup(h, "html.parser") - if soup.b: - return soup.b.text - else: - return "UNKNOWN_TAG" - - -def get_anchor_info(h: str) -> str: - result = [] - - data = BeautifulSoup(h, "html.parser").find_all("a", href=True) - for d in data: - result.append(d.text) - - return ",".join(result) - - -def get_text_info(h: str) -> str: - return h.split(": ")[1] - - -def get_cover(lx: html.HtmlElement) -> str: - return lx.xpath("/html/body/div[2]/div[2]/div[1]/p/a/img/@src")[0] - - -def get_outline(lx: html.HtmlElement) -> str: - return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[3]/div/text()")[0] - -def get_series2(lx: html.HtmlElement) -> str: - return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[1]/div[2]/a[11]/text()")[0] - - -def get_actor(data: hash) -> str: - if "女优" in data: - return get_anchor_info(data["女优"]) - else: - return "" - - -def get_label(data: hash) -> str: - if "片商" in data: - return get_anchor_info(data["片商"]) - else: - return "" - - -def get_tag(data: hash) -> str: - if "标签" in data: - return get_anchor_info(data["标签"]) - else: - return "" - - -def get_studio(data: hash) -> str: - if "片商" in data: - return get_anchor_info(data["片商"]) - else: - return "" - - -def get_number(data: hash) -> str: - if "番号" in data: - return get_text_info(data["番号"]) - else: - return "" - - -def get_release(data: hash) -> str: - if "发行日期" in data: - return get_text_info(data["发行日期"]) - else: - return "" - - -def get_runtime(data: hash) -> str: - if "播放时长" in data: - return get_text_info(data["播放时长"]) - else: - return "" - - -def get_year(data: hash) -> str: - if "release" in data: - return data["release"][:4] - else: - return "" - - -def get_series(data: hash) -> str: - if "系列" in data: - return get_anchor_info(data["系列"]) - else: - return "" - - -if __name__ == "__main__": - print(main("soe-259")) diff --git a/javbus.py b/javbus.py deleted file mode 100755 index 7d51a4d..0000000 --- a/javbus.py +++ /dev/null @@ -1,167 +0,0 @@ -import re -from pyquery import PyQuery as pq#need install -from lxml import etree#need install -from bs4 import BeautifulSoup#need install -import json -from ADC_function import * -import fanza - -def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img - soup = BeautifulSoup(htmlcode, 'lxml') - a = soup.find_all(attrs={'class': 'star-name'}) - d={} - for i in a: - l=i.a['href'] - t=i.get_text() - html = etree.fromstring(get_html(l), etree.HTMLParser()) - p=str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']") - p2={t:p} - d.update(p2) - return d -def getTitle(htmlcode): #获取标题 - doc = pq(htmlcode) - title=str(doc('div.container h3').text()).replace(' ','-') - try: - title2 = re.sub('n\d+-','',title) - return title2 - except: - return title -def getStudio(htmlcode): #获取厂商 - html = etree.fromstring(htmlcode,etree.HTMLParser()) - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/a/text()')).strip(" ['']") - return result -def getYear(htmlcode): #获取年份 - html = etree.fromstring(htmlcode,etree.HTMLParser()) - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']") - return result -def getCover(htmlcode): #获取封面链接 - doc = pq(htmlcode) - image = doc('a.bigImage') - return image.attr('href') -def getRelease(htmlcode): #获取出版日期 - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']") - return result -def getRuntime(htmlcode): #获取分钟 - soup = BeautifulSoup(htmlcode, 'lxml') - a = soup.find(text=re.compile('分鐘')) - return a -def getActor(htmlcode): #获取女优 - b=[] - soup=BeautifulSoup(htmlcode,'lxml') - a=soup.find_all(attrs={'class':'star-name'}) - for i in a: - b.append(i.get_text()) - return b -def getNum(htmlcode): #获取番号 - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']") - return result -def getDirector(htmlcode): #获取导演 - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']") - return result -def getCID(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) - #print(htmlcode) - string = html.xpath("//a[contains(@class,'sample-box')][1]/@href")[0].replace('https://pics.dmm.co.jp/digital/video/','') - result = re.sub('/.*?.jpg','',string) - return result -def getOutline(htmlcode): #获取演员 - html = etree.fromstring(htmlcode, etree.HTMLParser()) - try: - result = html.xpath("string(//div[contains(@class,'mg-b20 lh4')])").replace('\n','') - return result - except: - return '' -def getSerise(htmlcode): - try: - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']") - return result - except: - return '' -def getTag(htmlcode): # 获取演员 - tag = [] - soup = BeautifulSoup(htmlcode, 'lxml') - a = soup.find_all(attrs={'class': 'genre'}) - for i in a: - if 'onmouseout' in str(i): - continue - tag.append(i.get_text()) - return tag - -def main_uncensored(number): - htmlcode = get_html('https://www.javbus.com/' + number) - if getTitle(htmlcode) == '': - htmlcode = get_html('https://www.javbus.com/' + number.replace('-','_')) - try: - dww_htmlcode = fanza.main_htmlcode(getCID(htmlcode)) - except: - dww_htmlcode = '' - dic = { - 'title': str(re.sub('\w+-\d+-','',getTitle(htmlcode))).replace(getNum(htmlcode)+'-',''), - 'studio': getStudio(htmlcode), - 'year': getYear(htmlcode), - 'outline': getOutline(dww_htmlcode), - 'runtime': getRuntime(htmlcode), - 'director': getDirector(htmlcode), - 'actor': getActor(htmlcode), - 'release': getRelease(htmlcode), - 'number': getNum(htmlcode), - 'cover': getCover(htmlcode), - 'tag': getTag(htmlcode), - 'label': getSerise(htmlcode), - 'imagecut': 0, - 'actor_photo': '', - 'website': 'https://www.javbus.com/' + number, - 'source': 'javbus.py', - 'series': getSerise(htmlcode), - } - js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') - return js - - -def main(number): - try: - try: - htmlcode = get_html('https://www.javbus.com/' + number) - try: - dww_htmlcode = fanza.main_htmlcode(getCID(htmlcode)) - except: - dww_htmlcode = '' - dic = { - 'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))), - 'studio': getStudio(htmlcode), - 'year': str(re.search('\d{4}', getYear(htmlcode)).group()), - 'outline': getOutline(dww_htmlcode), - 'runtime': getRuntime(htmlcode), - 'director': getDirector(htmlcode), - 'actor': getActor(htmlcode), - 'release': getRelease(htmlcode), - 'number': getNum(htmlcode), - 'cover': getCover(htmlcode), - 'imagecut': 1, - 'tag': getTag(htmlcode), - 'label': getSerise(htmlcode), - 'actor_photo': getActorPhoto(htmlcode), - 'website': 'https://www.javbus.com/' + number, - 'source': 'javbus.py', - 'series': getSerise(htmlcode), - } - js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, - separators=(',', ':'), ) # .encode('UTF-8') - return js - except: - return main_uncensored(number) - except: - data = { - "title": "", - } - js = json.dumps( - data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":") - ) - return js - -if __name__ == "__main__" : - print(main('ipx-292')) diff --git a/javdb.py b/javdb.py deleted file mode 100755 index b1656d0..0000000 --- a/javdb.py +++ /dev/null @@ -1,154 +0,0 @@ -import re -from lxml import etree -import json -from bs4 import BeautifulSoup -from ADC_function import * -# import sys -# import io -# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) - -def getTitle(a): - html = etree.fromstring(a, etree.HTMLParser()) - result = html.xpath("/html/body/section/div/h2/strong/text()")[0] - return result -def getActor(a): # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text() - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//strong[contains(text(),"演員")]/../span/text()')).strip(" ['']") - result2 = str(html.xpath('//strong[contains(text(),"演員")]/../span/a/text()')).strip(" ['']") - return str(result1 + result2).strip('+').replace(",\\xa0", "").replace("'", "").replace(' ', '').replace(',,', '').lstrip(',').replace(',', ', ') -def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img - a = actor.split(',') - d={} - for i in a: - p={i:''} - d.update(p) - return d -def getStudio(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//strong[contains(text(),"片商")]/../span/text()')).strip(" ['']") - result2 = str(html.xpath('//strong[contains(text(),"片商")]/../span/a/text()')).strip(" ['']") - return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') -def getRuntime(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']") - result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']") - return str(result1 + result2).strip('+').rstrip('mi') -def getLabel(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']") - result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']") - return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') -def getNum(a): - html = etree.fromstring(a, etree.HTMLParser()) - result1 = str(html.xpath('//strong[contains(text(),"番號")]/../span/text()')).strip(" ['']") - result2 = str(html.xpath('//strong[contains(text(),"番號")]/../span/a/text()')).strip(" ['']") - return str(result2 + result1).strip('+') -def getYear(getRelease): - try: - result = str(re.search('\d{4}', getRelease).group()) - return result - except: - return getRelease -def getRelease(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//strong[contains(text(),"時間")]/../span/text()')).strip(" ['']") - result2 = str(html.xpath('//strong[contains(text(),"時間")]/../span/a/text()')).strip(" ['']") - return str(result1 + result2).strip('+') -def getTag(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - try: - result = html.xpath('//strong[contains(text(),"類別")]/../span/a/text()') - return result - except: - result = html.xpath('//strong[contains(text(),"類別")]/../span/text()') - return result - -def getCover_small(a, index=0): - # same issue mentioned below, - # javdb sometime returns multiple results - # DO NOT just get the firt one, get the one with correct index number - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - try: - result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index] - if not 'https' in result: - result = 'https:' + result - return result - except: # 2020.7.17 Repair Cover Url crawl - result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@data-src")[index] - if not 'https' in result: - result = 'https:' + result - return result -def getCover(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) - try: - result = html.xpath("//div[contains(@class, 'column-video-cover')]/a/img/@src")[0] - except: # 2020.7.17 Repair Cover Url crawl - result = html.xpath("//div[contains(@class, 'column-video-cover')]/img/@src")[0] - return result -def getDirector(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']") - result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']") - return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') -def getOutline(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('//*[@id="introduction"]/dd/p[1]/text()')).strip(" ['']") - return result -def getSeries(a): - #/html/body/section/div/div[3]/div[2]/nav/div[7]/span/a - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']") - result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']") - return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') -def main(number): - try: - number = number.upper() - try: - query_result = get_html('https://javdb.com/search?q=' + number + '&f=all') - except: - query_result = get_html('https://javdb4.com/search?q=' + number + '&f=all') - html = etree.fromstring(query_result, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - # javdb sometime returns multiple results, - # and the first elememt maybe not the one we are looking for - # iterate all candidates and find the match one - urls = html.xpath('//*[@id="videos"]/div/div/a/@href') - ids =html.xpath('//*[@id="videos"]/div/div/a/div[contains(@class, "uid")]/text()') - correct_url = urls[ids.index(number)] - detail_page = get_html('https://javdb.com' + correct_url) - - # If gray image exists ,then replace with normal cover - cover_small = getCover_small(query_result, index=ids.index(number)) - if 'placeholder' in cover_small: - cover_small = getCover(detail_page) - - - dic = { - 'actor': getActor(detail_page), - 'title': getTitle(detail_page), - 'studio': getStudio(detail_page), - 'outline': getOutline(detail_page), - 'runtime': getRuntime(detail_page), - 'director': getDirector(detail_page), - 'release': getRelease(detail_page), - 'number': getNum(detail_page), - 'cover': getCover(detail_page), - 'cover_small': cover_small, - 'imagecut': 3, - 'tag': getTag(detail_page), - 'label': getLabel(detail_page), - 'year': getYear(getRelease(detail_page)), # str(re.search('\d{4}',getRelease(a)).group()), - 'actor_photo': getActorPhoto(getActor(detail_page)), - 'website': 'https://javdb.com' + correct_url, - 'source': 'javdb.py', - 'series': getSeries(detail_page), - } - except Exception as e: - # print(e) - dic = {"title": ""} - js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') - return js - -# main('DV-1562') -# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。") -if __name__ == "__main__": - print(main('snyz-007')) diff --git a/javlib.py b/javlib.py deleted file mode 100644 index cb6f78b..0000000 --- a/javlib.py +++ /dev/null @@ -1,110 +0,0 @@ -import json -import bs4 -from bs4 import BeautifulSoup -from lxml import html -from http.cookies import SimpleCookie - -from ADC_function import get_javlib_cookie, get_html - - -def main(number: str): - raw_cookies, user_agent = get_javlib_cookie() - - # Blank cookies mean javlib site return error - if not raw_cookies: - return json.dumps({}, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':')) - - # Manually construct a dictionary - s_cookie = SimpleCookie() - s_cookie.load(raw_cookies) - cookies = {} - for key, morsel in s_cookie.items(): - cookies[key] = morsel.value - - # Scraping - result = get_html( - "http://www.javlibrary.com/cn/vl_searchbyid.php?keyword={}".format(number), - cookies=cookies, - ua=user_agent, - return_type="object" - ) - soup = BeautifulSoup(result.text, "html.parser") - lx = html.fromstring(str(soup)) - - if "/?v=jav" in result.url: - dic = { - "title": get_title(lx, soup), - "studio": get_table_el_single_anchor(soup, "video_maker"), - "year": get_table_el_td(soup, "video_date")[:4], - "outline": "", - "director": get_table_el_single_anchor(soup, "video_director"), - "cover": get_cover(lx), - "imagecut": 1, - "actor_photo": "", - "website": result.url, - "source": "javlib.py", - "actor": get_table_el_multi_anchor(soup, "video_cast"), - "label": get_table_el_td(soup, "video_label"), - "tag": get_table_el_multi_anchor(soup, "video_genres"), - "number": get_table_el_td(soup, "video_id"), - "release": get_table_el_td(soup, "video_date"), - "runtime": get_from_xpath(lx, '//*[@id="video_length"]/table/tr/td[2]/span/text()'), - "series":'', - } - else: - dic = {} - - return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':')) - - -def get_from_xpath(lx: html.HtmlElement, xpath: str) -> str: - return lx.xpath(xpath)[0].strip() - - -def get_table_el_single_anchor(soup: BeautifulSoup, tag_id: str) -> str: - tag = soup.find(id=tag_id).find("a") - - if tag is not None: - return tag.string.strip() - else: - return "" - - -def get_table_el_multi_anchor(soup: BeautifulSoup, tag_id: str) -> str: - tags = soup.find(id=tag_id).find_all("a") - - return process(tags) - - -def get_table_el_td(soup: BeautifulSoup, tag_id: str) -> str: - tags = soup.find(id=tag_id).find_all("td", class_="text") - - return process(tags) - - -def process(tags: bs4.element.ResultSet) -> str: - values = [] - for tag in tags: - value = tag.string - if value is not None and value != "----": - values.append(value) - - return ",".join(x for x in values if x) - - -def get_title(lx: html.HtmlElement, soup: BeautifulSoup) -> str: - title = get_from_xpath(lx, '//*[@id="video_title"]/h3/a/text()') - number = get_table_el_td(soup, "video_id") - - return title.replace(number, "").strip() - - -def get_cover(lx: html.HtmlComment) -> str: - return "http:{}".format(get_from_xpath(lx, '//*[@id="video_jacket_img"]/@src')) - - -if __name__ == "__main__": - lists = ["DVMC-003", "GS-0167", "JKREZ-001", "KMHRS-010", "KNSD-023"] - #lists = ["DVMC-003"] - for num in lists: - print(main(num)) diff --git a/mgstage.py b/mgstage.py deleted file mode 100755 index 2c6391b..0000000 --- a/mgstage.py +++ /dev/null @@ -1,120 +0,0 @@ -import re -from lxml import etree -import json -from bs4 import BeautifulSoup -from ADC_function import * -# import sys -# import io -# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) - -def getTitle(a): - try: - html = etree.fromstring(a, etree.HTMLParser()) - result = str(html.xpath('//*[@id="center_column"]/div[1]/h1/text()')).strip(" ['']") - return result.replace('/', ',') - except: - return '' -def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text() - html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text() - result1=str(html.xpath('//th[contains(text(),"出演:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n') - result2=str(html.xpath('//th[contains(text(),"出演:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n') - return str(result1+result2).strip('+').replace("', '",'').replace('"','').replace('/',',') -def getStudio(a): - html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text() - result1=str(html.xpath('//th[contains(text(),"メーカー:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n') - result2=str(html.xpath('//th[contains(text(),"メーカー:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n') - return str(result1+result2).strip('+').replace("', '",'').replace('"','') -def getRuntime(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n') - result2 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n') - return str(result1 + result2).strip('+').rstrip('mi') -def getLabel(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip( - '\\n') - result2 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/text()')).strip(" ['']").strip('\\n ').strip( - '\\n') - return str(result1 + result2).strip('+').replace("', '",'').replace('"','') -def getNum(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//th[contains(text(),"品番:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip( - '\\n') - result2 = str(html.xpath('//th[contains(text(),"品番:")]/../td/text()')).strip(" ['']").strip('\\n ').strip( - '\\n') - return str(result1 + result2).strip('+') -def getYear(getRelease): - try: - result = str(re.search('\d{4}',getRelease).group()) - return result - except: - return getRelease -def getRelease(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip( - '\\n') - result2 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/text()')).strip(" ['']").strip('\\n ').strip( - '\\n') - return str(result1 + result2).strip('+').replace('/','-') -def getTag(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip( - '\\n') - result2 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/text()')).strip(" ['']").strip('\\n ').strip( - '\\n') - return str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','').replace(',,','').split(',') -def getCover(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('//*[@id="center_column"]/div[1]/div[1]/div/div/h2/img/@src')).strip(" ['']") - # /html/body/div[2]/article[2]/div[1]/div[1]/div/div/h2/img/@src - return result -def getDirector(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip( - '\\n') - result2 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/text()')).strip(" ['']").strip('\\n ').strip( - '\\n') - return str(result1 + result2).strip('+').replace("', '",'').replace('"','') -def getOutline(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('//p/text()')).strip(" ['']").replace(u'\\n', '').replace("', '', '", '') - return result -def getSeries(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip( - '\\n') - result2 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/text()')).strip(" ['']").strip('\\n ').strip( - '\\n') - return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') -def main(number2): - number=number2.upper() - htmlcode=str(get_html('https://www.mgstage.com/product/product_detail/'+str(number)+'/',cookies={'adc':'1'})) - soup = BeautifulSoup(htmlcode, 'lxml') - a = str(soup.find(attrs={'class': 'detail_data'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','') - b = str(soup.find(attrs={'id': 'introduction'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','') - #print(b) - dic = { - 'title': getTitle(htmlcode).replace("\\n",'').replace(' ',''), - 'studio': getStudio(a), - 'outline': getOutline(b), - 'runtime': getRuntime(a), - 'director': getDirector(a), - 'actor': getActor(a), - 'release': getRelease(a), - 'number': getNum(a), - 'cover': getCover(htmlcode), - 'imagecut': 0, - 'tag': getTag(a), - 'label':getLabel(a), - 'year': getYear(getRelease(a)), # str(re.search('\d{4}',getRelease(a)).group()), - 'actor_photo': '', - 'website':'https://www.mgstage.com/product/product_detail/'+str(number)+'/', - 'source': 'mgstage.py', - 'series': getSeries(a), - } - js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') - return js - #print(htmlcode) - -if __name__ == '__main__': - print(main('SIRO-4149')) diff --git a/xcity.py b/xcity.py deleted file mode 100644 index e3f04cb..0000000 --- a/xcity.py +++ /dev/null @@ -1,192 +0,0 @@ -import re -from lxml import etree -import json -from bs4 import BeautifulSoup -from ADC_function import * - - -# import sys -# import io -# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) - -def getTitle(a): - html = etree.fromstring(a, etree.HTMLParser()) - result = html.xpath('//*[@id="program_detail_title"]/text()')[0] - return result - - -def getActor(a): # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text() - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[3]/a/text()')[0] - return result1 - - -def getActorPhoto(actor): # //*[@id="star_qdt"]/li/a/img - a = actor.split(',') - d = {} - for i in a: - p = {i: ''} - d.update(p) - return d - - -def getStudio(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - try: - result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[4]/a/span/text()')).strip(" ['']") - except: - result = str(html.xpath('//strong[contains(text(),"片商")]/../following-sibling::span/a/text()')).strip(" ['']") - return result.strip('+').replace("', '", '').replace('"', '') - - -def getRuntime(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - try: - result1 = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[3]/text()')[0] - except: - return '' - try: - return re.findall('\d+',result1)[0] - except: - return '' - - -def getLabel(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - try: - result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[5]/a/span/text()')[0] - return result - except: - return '' - - -def getNum(a): - html = etree.fromstring(a, etree.HTMLParser()) - try: - result = html.xpath('//*[@id="hinban"]/text()')[0] - return result - except: - return '' - - -def getYear(getRelease): - try: - result = str(re.search('\d{4}', getRelease).group()) - return result - except: - return getRelease - - -def getRelease(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - try: - result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[4]/text()')[0] - except: - return '' - try: - return re.findall('\d{4}/\d{2}/\d{2}', result)[0].replace('/','-') - except: - return '' - - -def getTag(a): - result2=[] - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[6]/a/text()') - for i in result1: - i=i.replace(u'\n','') - i=i.replace(u'\t','') - result2.append(i) - return result2 - - -def getCover_small(a, index=0): - # same issue mentioned below, - # javdb sometime returns multiple results - # DO NOT just get the firt one, get the one with correct index number - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index] - if not 'https' in result: - result = 'https:' + result - return result - - -def getCover(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) - try: - result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[1]/p/a/@href')[0] - return 'https:' + result - except: - return '' - - -def getDirector(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - try: - result = html.xpath('//*[@id="program_detail_director"]/text()')[0].replace(u'\n','').replace(u'\t', '') - return result - except: - return '' - - -def getOutline(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) - try: - result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[5]/p/text()')[0] - except: - return '' - try: - return re.sub('\\\\\w*\d+','',result) - except: - return result - -def getSeries(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) - try: - try: - result = html.xpath("//span[contains(text(),'シリーズ')]/../a/span/text()")[0] - return result - except: - result = html.xpath("//span[contains(text(),'シリーズ')]/../span/text()")[0] - return result - except: - return '' - - -def main(number): - try: - number = number.upper() - query_result = get_html( - 'https://xcity.jp/result_published/?genre=%2Fresult_published%2F&q=' + number.replace('-','') + '&sg=main&num=30') - html = etree.fromstring(query_result, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - urls = html.xpath("//table[contains(@class, 'resultList')]/tr[2]/td[1]/a/@href")[0] - detail_page = get_html('https://xcity.jp' + urls) - dic = { - 'actor': getActor(detail_page), - 'title': getTitle(detail_page), - 'studio': getStudio(detail_page), - 'outline': getOutline(detail_page), - 'runtime': getRuntime(detail_page), - 'director': getDirector(detail_page), - 'release': getRelease(detail_page), - 'number': getNum(detail_page), - 'cover': getCover(detail_page), - 'cover_small': '', - 'imagecut': 1, - 'tag': getTag(detail_page), - 'label': getLabel(detail_page), - 'year': getYear(getRelease(detail_page)), # str(re.search('\d{4}',getRelease(a)).group()), - 'actor_photo': getActorPhoto(getActor(detail_page)), - 'website': 'https://xcity.jp' + urls, - 'source': 'xcity.py', - 'series': getSeries(detail_page), - } - except Exception as e: - # print(e) - dic = {"title": ""} - - js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') - return js - -if __name__ == '__main__': - print(main('VNDS-2624'))