diff --git a/WebCrawler/airav.py b/WebCrawler/airav.py
deleted file mode 100644
index d25b8ad..0000000
--- a/WebCrawler/airav.py
+++ /dev/null
@@ -1,227 +0,0 @@
-import sys
-sys.path.append('../')
-from bs4 import BeautifulSoup  # need install
-from ADC_function import *
-from WebCrawler import javbus
-
-'''
-API
-Sign up: https://www.airav.wiki/api/auth/signup
-Settings: https://www.airav.wiki/api/get_web_settings
-Search: https://www.airav.wiki/api/video/list?lng=zh-CN&search=
-Search: https://www.airav.wiki/api/video/list?lang=zh-TW&lng=zh-TW&search=
-'''
-host = 'https://www.airav.wiki'
-
-# airav has no actor photos, so use the javbus ones directly
-def getActorPhoto(javbus_json):
-    result = javbus_json.get('actor_photo')
-    if isinstance(result, dict) and len(result):
-        return result
-    return ''
-
-def getTitle(htmlcode):  # get the title
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    title = str(html.xpath('/html/head/title/text()')[0])
-    result = str(re.findall('](.*?)- AIRAV-WIKI', title)[0]).strip()
-    return result
-
-def getStudio(htmlcode, javbus_json):  # get the studio (revised)
-    # prefer javbus data when available
-    result = javbus_json.get('studio')
-    if isinstance(result, str) and len(result):
-        return result
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    return str(html.xpath('//a[contains(@href,"?video_factory=")]/text()')).strip(" ['']")
-def getYear(htmlcode, javbus_json):  # get the year
-    result = javbus_json.get('year')
-    if isinstance(result, str) and len(result):
-        return result
-    release = getRelease(htmlcode, javbus_json)
-    if len(release) != len('2000-01-01'):
-        return ''
-    return release[:4]
-def getCover(htmlcode, javbus_json):  # get the cover image
-    result = javbus_json.get('cover')
-    if isinstance(result, str) and len(result):
-        return result
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    return html.xpath('//img[contains(@src,"/storage/big_pic/")]/@src')[0]
-def getRelease(htmlcode, javbus_json):  # get the release date
-    result = javbus_json.get('release')
-    if isinstance(result, str) and len(result):
-        return result
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    try:
-        result = re.search(r'\d{4}-\d{2}-\d{2}', str(html.xpath('//li[contains(text(),"發片日期")]/text()'))).group()
-    except:
-        return ''
-    return result
-def getRuntime(javbus_json):  # get the runtime
-    result = javbus_json.get('runtime')
-    if isinstance(result, str) and len(result):
-        return result
-    return ''
-# airav's actress database mostly uses kanji names while javbus mostly uses kana, so airav takes priority
-def getActor(htmlcode, javbus_json):  # get the actresses
-    b = []
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    a = html.xpath('//ul[@class="videoAvstarList"]/li/a[starts-with(@href,"/idol/")]/text()')
-    for v in a:
-        v = v.strip()
-        if len(v):
-            b.append(v)
-    if len(b):
-        return b
-    result = javbus_json.get('actor')
-    if isinstance(result, list) and len(result):
-        return result
-    return []
-def getNum(htmlcode, javbus_json):  # get the movie number
-    result = javbus_json.get('number')
-    if isinstance(result, str) and len(result):
-        return result
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    title = str(html.xpath('/html/head/title/text()')[0])
-    result = str(re.findall('^\[(.*?)]', title)[0])
-    return result
-def getDirector(javbus_json):  # get the director (revised)
-    result = javbus_json.get('director')
-    if isinstance(result, str) and len(result):
-        return result
-    return ''
-def getOutline(htmlcode):  # get the outline
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    try:
-        result = html.xpath("string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)").replace('\n', '').strip()
-        return result
-    except:
-        return ''
-def getSerise(javbus_json):  # get the series (revised)
-    result = javbus_json.get('series')
-    if isinstance(result, str) and len(result):
-        return result
-    return ''
-def getTag(htmlcode):  # get the tags
-    tag = []
-    soup = BeautifulSoup(htmlcode, 'lxml')
-    x = soup.find_all(attrs={'class': 'tagBtnMargin'})
-    a = x[0].find_all('a')
-
-    for i in a:
-        tag.append(i.get_text())
-    return tag
-
-def getExtrafanart(htmlcode):  # get the stills
-    html_pather = re.compile(r'<div class=\"mobileImgThumbnail\">[\s\S]*?</div>')
-    html = html_pather.search(htmlcode)
-    if html:
-        html = html.group()
-        extrafanart_pather = re.compile(r'<img.*?src=\"(.*?)\"')
-        extrafanart_imgs = extrafanart_pather.findall(html)
-        if extrafanart_imgs:
-            return extrafanart_imgs
-    return ''
-
-def getVideoList(keyword):  # paginated search against the airav API
-    result = []
-    page = 1
-    while page > 0:
-        # search_result = {"offset": 0,"count": 4,"result": [
-        # {"vid": "99-07-15076","slug": "Wrop6o","name": "朝ゴミ出しする近所の遊び好きノーブラ奥さん 江波りゅう",
-        # "url": "","view": 98,"img_url": "https://wiki-img.airav.wiki/storage/big_pic/99-07-15076.jpg","barcode": "_1pondo_012717_472"},
-        # {"vid": "99-27-00286","slug": "DlPEua","name": "放課後に、仕込んでください 〜優等生は無言でスカートを捲り上げる〜",
-        # "url": "","view": 69,"img_url": "https://wiki-img.airav.wiki/storage/big_pic/99-27-00286.jpg","barcode": "caribbeancom012717-360"},
-        # {"vid": "99-07-15070","slug": "VLS3WY","name": "放課後に、仕込んでください ~優等生は無言でスカートを捲り上げる~ ももき希",
-        # "url": "","view": 58,"img_url": "https://wiki-img.airav.wiki/storage/big_pic/99-07-15070.jpg","barcode": "caribbeancom_012717-360"},
-        # {"vid": "99-27-00287","slug": "YdMVb3","name": "朝ゴミ出しする近所の遊び好きノーブラ奥さん 江波りゅう",
-        # "url": "","view": 56,"img_url": "https://wiki-img.airav.wiki/storage/big_pic/99-27-00287.jpg","barcode": "1pondo_012717_472"}
-        # ],"status": "ok"}
-        search_result = get_html(host + '/api/video/list?lang=zh-TW&lng=jp&search=' + keyword + '&page=' + str(page))
-
-        try:
-            json_data = json.loads(search_result)
-        except json.decoder.JSONDecodeError:
-            # print("[-]Json decoder error!")
-            return []
-
-        result_offset = int(json_data["offset"])
-        result_count = int(json_data["count"])
-        result_size = len(json_data["result"])
-        if result_count <= 0 or result_size <= 0:
-            return result
-        elif result_count > result_offset + result_size:  # fetch the next page
-            result.extend(json_data["result"])
-            page += 1
-        elif result_count == result_offset + result_size:  # this was the last page
-            result.extend(json_data["result"])
-            page = 0
-        else:
-            page = 0
-
-    return result
-
-def main(number):
-    try:
-        try:
-            htmlcode = get_html('https://cn.airav.wiki/video/' + number)
-            javbus_json = json.loads(javbus.main(number))
-
-        except:
-            # print(number)
-            pass
-
-        dic = {
-            # the title can come from airav
-            'title': getTitle(htmlcode),
-            # studio: try javbus first, fall back to this site
-            'studio': getStudio(htmlcode, javbus_json),
-            # year: try javbus first, fall back to this site
-            'year': getYear(htmlcode, javbus_json),
-            # outline: use airav
-            'outline': getOutline(htmlcode),
-            # use javbus
-            'runtime': getRuntime(javbus_json),
-            # director: use javbus
-            'director': getDirector(javbus_json),
-            # actors: try airav first
-            'actor': getActor(htmlcode, javbus_json),
-            # release date: try javbus first
-            'release': getRelease(htmlcode, javbus_json),
-            # number: use javbus
-            'number': getNum(htmlcode, javbus_json),
-            # cover link: use javbus
-            'cover': getCover(htmlcode, javbus_json),
-            # stills
-            'extrafanart': getExtrafanart(htmlcode),
-            'imagecut': 1,
-            # use airav
-            'tag': getTag(htmlcode),
-            # use javbus
-            'label': getSerise(javbus_json),
-            'actor_photo': getActorPhoto(javbus_json),
-            'website': 'https://www.airav.wiki/video/' + number,
-            'source': 'airav.py',
-            # use javbus
-            'series': getSerise(javbus_json)
-        }
-        js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
-        return js
-    except Exception as e:
-        if config.getInstance().debug():
-            print(e)
-        data = {
-            "title": "",
-        }
-        js = json.dumps(
-            data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
-        )
-        return js
-
-
-if __name__ == '__main__':
-    config.getInstance().set_override("actor_photo:download_for_kodi=1")
-    config.getInstance().set_override("debug_mode:switch=1")
-    print(main('ADV-R0624'))  # javbus page returns 404, airav has data
-    print(main('ADN-188'))  # one actress
-    print(main('CJOD-278'))  # multiple actresses; javbus names use kana, airav uses kanji
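The paginated search helper above (reconstructed here as getVideoList; its signature and both regex literals were garbled in this copy of the diff) walks /api/video/list page by page until offset plus page size reaches the reported count. A standalone sketch of that pagination contract, using plain requests instead of the project's get_html(); the helper name is illustrative only:

import requests

HOST = 'https://www.airav.wiki'

def walk_airav_search(keyword):
    # Mirrors the offset/count bookkeeping above; walk_airav_search is an
    # illustrative name, not part of the original module.
    results, page = [], 1
    while True:
        data = requests.get(f'{HOST}/api/video/list',
                            params={'lang': 'zh-TW', 'lng': 'jp',
                                    'search': keyword, 'page': page}).json()
        chunk = data.get('result', [])
        if int(data.get('count', 0)) <= 0 or not chunk:
            return results
        results.extend(chunk)
        if int(data['offset']) + len(chunk) >= int(data['count']):
            return results  # last page reached
        page += 1

diff --git a/WebCrawler/avsox.py b/WebCrawler/avsox.py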
deleted file mode 100644
index a18eab6..0000000
--- a/WebCrawler/avsox.py
+++ /dev/null
@@ -1,86 +0,0 @@
-import sys
-sys.path.append('..')
-from ADC_function import *
-from WebCrawler.storyline import getStoryline
-from WebCrawler.crawler import *
-# import io
-# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
-
-def getActorPhoto(html):
-    a = html.xpath('//a[@class="avatar-box"]')
-    d = {}
-    for i in a:
-        l = i.find('.//img').attrib['src']
-        t = i.find('span').text
-        p2 = {t: l}
-        d.update(p2)
-    return d
-
-def getActor(html):
-    a = html.xpath('//a[@class="avatar-box"]')
-    d = []
-    for i in a:
-        d.append(i.find('span').text)
-    return d
-
-def getCover_small(html):
-    result = str(html.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')).strip(" ['']")
-    return result
-def getTag(html):
-    x = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
-    return [i.strip() for i in x[2:]] if len(x) > 2 else []
-
-def main(number):
-    html = get_html('https://tellme.pw/avsox')
-    site = Crawler(html).getString('//div[@class="container"]/div/a/@href')
-    a = get_html(site + '/cn/search/' + number)
-    html = Crawler(a)
-    result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
-    if result1 == '' or result1 == 'null' or result1 == 'None':
-        a = get_html(site + '/cn/search/' + number.replace('-', '_'))
-        html = Crawler(a)
-        result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
-    if result1 == '' or result1 == 'null' or result1 == 'None':
-        a = get_html(site + '/cn/search/' + number.replace('_', ''))
-        html = Crawler(a)
-        result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
-    detail = get_html("https:" + result1)
-    lx = etree.fromstring(detail, etree.HTMLParser())
-    avsox_crawler2 = Crawler(a)
-    avsox_crawler = Crawler(detail)
-    try:
-        new_number = avsox_crawler.getString('//span[contains(text(),"识别码:")]/../span[2]/text()')
-        if new_number.upper() != number.upper():
-            raise ValueError('number not found')
-        title = avsox_crawler.getString('/html/body/div[2]/h3/text()').replace('/', '').strip(new_number)
-        dic = {
-            'actor': getActor(lx),
-            'title': title,
-            'studio': avsox_crawler.getString('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()').replace("', '", ' '),
-            'outline': getStoryline(number, title),
-            'runtime': avsox_crawler.getString('//span[contains(text(),"长度:")]/../text()').replace('分钟', ''),
-            'director': '',  #
-            'release': avsox_crawler.getString('//span[contains(text(),"发行时间:")]/../text()'),
-            'number': new_number,
-            'cover': avsox_crawler.getString('/html/body/div[2]/div[1]/div[1]/a/img/@src'),
-            #'cover_small' : getCover_small(html),
-            'cover_small': avsox_crawler2.getString('//*[@id="waterfall"]/div/a/div[1]/img/@src'),
-            'imagecut': 3,
-            'tag': getTag(lx),
-            'label': avsox_crawler.getString('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()'),
-            'year': re.findall('\d{4}', avsox_crawler.getString('//span[contains(text(),"发行时间:")]/../text()'))[0],
-            'actor_photo': getActorPhoto(lx),
-            'website': "https:" + result1,
-            'source': 'avsox.py',
-            'series': avsox_crawler.getString('//span[contains(text(),"系列:")]/../span[2]/text()'),
-        }
-    except Exception as e:
-        if config.getInstance().debug():
-            print(e)
-        dic = {"title": ""}
-    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
-    return js
-
-if __name__ == "__main__":
-    print(main('012717_472'))
-    print(main('1'))  # got fake result raise 'number not found'
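main() above retries the avsox search with progressively normalized numbers: as given, then with '-' swapped for '_', then with '_' stripped. A small sketch of that candidate ordering; the helper name is made up:

def search_candidates(number):
    # Ordering used by avsox.main() above; deduplicated so no-op
    # replacements are not queried twice.
    cands = [number, number.replace('-', '_'), number.replace('_', '')]
    seen = set()
    return [c for c in cands if not (c in seen or seen.add(c))]

# search_candidates('012717_472') -> ['012717_472', '012717472']

diff --git a/WebCrawler/carib.py b/WebCrawler/carib.py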
deleted file mode 100755
index 50cbcc1..0000000
--- a/WebCrawler/carib.py
+++ /dev/null
@@ -1,133 +0,0 @@
-import sys
-sys.path.append('../')
-from lxml import html
-from ADC_function import *
-from WebCrawler.storyline import getStoryline
-
-
-G_SITE = 'https://www.caribbeancom.com'
-
-
-def main(number: str) -> json:
-    try:
-        url = f'{G_SITE}/moviepages/{number}/index.html'
-        result, session = get_html_session(url, return_type='session')
-        if not result:
-            raise ValueError("page not found")
-        htmlcode = result.content.decode('euc-jp')
-        if not htmlcode or '404' in htmlcode or 'class="movie-info section"' not in htmlcode:
-            raise ValueError("page not found")
-
-        lx = html.fromstring(htmlcode)
-        title = get_title(lx)
-
-        dic = {
-            'title': title,
-            'studio': '加勒比',
-            'year': get_year(lx),
-            'outline': get_outline(lx, number, title),
-            'runtime': get_runtime(lx),
-            'director': '',
-            'actor': get_actor(lx),
-            'release': get_release(lx),
-            'number': number,
-            'cover': f'{G_SITE}/moviepages/{number}/images/l_l.jpg',
-            'tag': get_tag(lx),
-            'extrafanart': get_extrafanart(lx),
-            'label': get_series(lx),
-            'imagecut': 1,
-            'website': f'{G_SITE}/moviepages/{number}/index.html',
-            'source': 'carib.py',
-            'series': get_series(lx),
-            '无码': True
-        }
-        if config.getInstance().download_actor_photo_for_kodi():
-            dic['actor_photo'] = get_actor_photo(lx, session)
-        js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
-        return js
-
-    except Exception as e:
-        if config.getInstance().debug():
-            print(e)
-        dic = {"title": ""}
-        return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
-
-
-def get_title(lx: html.HtmlElement) -> str:
-    return str(lx.xpath("//div[@class='movie-info section']/div[@class='heading']/h1[@itemprop='name']/text()")[0]).strip()
-
-def get_year(lx: html.HtmlElement) -> str:
-    return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0][:4]
-
-def get_outline(lx: html.HtmlElement, number: str, title: str) -> str:
-    o = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip()
-    g = getStoryline(number, title, 无码=True)
-    if len(g):
-        return g
-    return o
-
-def get_release(lx: html.HtmlElement) -> str:
-    return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0].replace('/', '-')
-
-def get_actor(lx: html.HtmlElement):
-    r = []
-    actors = lx.xpath("//span[@class='spec-content']/a[@itemprop='actor']/span/text()")
-    for act in actors:
-        if str(act) != '他':
-            r.append(act)
-    return r
-
-def get_tag(lx: html.HtmlElement) -> str:
-    genres = lx.xpath("//span[@class='spec-content']/a[@itemprop='genre']/text()")
-    return genres
-
-def get_extrafanart(lx: html.HtmlElement) -> str:
-    r = []
-    genres = lx.xpath("//*[@id='sampleexclude']/div[2]/div/div[@class='grid-item']/div/a/@href")
-    for g in genres:
-        jpg = str(g)
-        if '/member/' in jpg:
-            break
-        else:
-            r.append('https://www.caribbeancom.com' + jpg)
-    return r
-
-def get_series(lx: html.HtmlElement) -> str:
-    try:
-        return str(lx.xpath("//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()")[0]).strip()
-    except:
-        return ''
-
-def get_runtime(lx: html.HtmlElement) -> str:
-    return str(lx.xpath("//span[@class='spec-content']/span[@itemprop='duration']/text()")[0]).strip()
-
-def get_actor_photo(lx, session):
-    htmla = lx.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']")
-    names = lx.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']/span[@itemprop='name']/text()")
-    t = {}
-    for name, a in zip(names, htmla):
-        if name.strip() == '他':
-            continue
-        p = {name.strip(): a.attrib['href']}
-        t.update(p)
-    o = {}
-    for k, v in t.items():
-        if '/search_act/' not in v:
-            continue
-        r = session.get(urljoin(G_SITE, v))
-        if not r.ok:
-            continue
-        html = r.text
-        pos = html.find('.full-bg')
-        if pos < 0:
-            continue
-        css = html[pos:pos+100]
-        cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I)
-        if not cssBGjpgs or not len(cssBGjpgs[0]):
-            continue
-        p = {k: urljoin(r.url, cssBGjpgs[0])}
-        o.update(p)
-    return o
-
-if __name__ == "__main__":
-    print(main("070116-197"))  # actor has a photo
-    print(main("041721-001"))
-    print(main("080520-001"))
diff --git a/WebCrawler/crawler.py b/WebCrawler/crawler.py
deleted file mode 100644
index e6176b6..0000000
--- a/WebCrawler/crawler.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from lxml import etree
-
-class Crawler:
-    def __init__(self, htmlcode):
-        self.html = etree.HTML(htmlcode)
-
-    def getString(self, _xpath):
-        if _xpath == "":
-            return ""
-        result = self.html.xpath(_xpath)
-        try:
-            return result[0]
-        except:
-            return ""
-
-    def getStrings(self, _xpath):
-        result = self.html.xpath(_xpath)
-        try:
-            return result
-        except:
-            return ""
-
-    def getOutline(self, _xpath):
-        result = self.html.xpath(_xpath)
-        try:
-            return "\n".join(result)
-        except:
-            return ""
\ No newline at end of file
diff --git a/WebCrawler/dlsite.py b/WebCrawler/dlsite.py
deleted file mode 100644
index 54ed6f7..0000000
--- a/WebCrawler/dlsite.py
+++ /dev/null
@@ -1,185 +0,0 @@
-import re
-from lxml import etree
-import json
-import sys
-sys.path.append('../')
-from ADC_function import *
-
-def getTitle(html):
-    result = str(html.xpath('/html/head/title/text()')[0])
-    result = result[:result.rfind(' | DLsite')]
-    result = result[:result.rfind(' [')]
-    result = result.replace('【HD版】', '')
-    return result
-def getActor(html):  # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
-    try:
-        result1 = html.xpath('//th[contains(text(),"声优")]/../td/a/text()')
-    except:
-        result1 = ''
-    return result1
-def getActorPhoto(actor):  # //*[@id="star_qdt"]/li/a/img
-    a = actor.split(',')
-    d = {}
-    for i in a:
-        p = {i: ''}
-        d.update(p)
-    return d
-def getStudio(html):
-    try:
-        try:
-            result = html.xpath('//th[contains(text(),"商标名")]/../td/span[1]/a/text()')[0]
-        except:
-            result = html.xpath('//th[contains(text(),"社团名")]/../td/span[1]/a/text()')[0]
-    except:
-        result = ''
-    return result
-def getRuntime(a):
-    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-    result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']")
-    result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']")
-    return str(result1 + result2).strip('+').rstrip('mi')
-def getLabel(html):
-    try:
-        try:
-            result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0]
-        except:
-            result = html.xpath('//th[contains(text(),"社团名")]/../td/span[1]/a/text()')[0]
-    except:
-        result = ''
-    return result
-def getYear(getRelease):
-    try:
-        result = str(re.search('\d{4}', getRelease).group())
-        return result
-    except:
-        return getRelease
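crawler.py above is the thin lxml wrapper most scrapers in this tree build on: getString returns the first xpath hit or '', getStrings the full hit list, getOutline the hits joined with newlines. A quick usage sketch against a made-up HTML snippet:

from WebCrawler.crawler import Crawler

snippet = '<html><body><h1>Example</h1><p>line one</p><p>line two</p></body></html>'
c = Crawler(snippet)
c.getString('//h1/text()')   # 'Example' (first match, or '' when nothing matches)
c.getStrings('//p/text()')   # ['line one', 'line two']
c.getOutline('//p/text()')   # 'line one\nline two'

-def getRelease(html):
-    result1 = 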
html.xpath('//th[contains(text(),"贩卖日")]/../td/a/text()')[0] - return result1.replace('年','-').replace('月','-').replace('日','') -def getTag(html): - try: - result = html.xpath('//th[contains(text(),"分类")]/../td/div/a/text()') - return result - except: - return '' - -def getCover_small(a, index=0): - # same issue mentioned below, - # javdb sometime returns multiple results - # DO NOT just get the firt one, get the one with correct index number - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - try: - result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index] - if not 'https' in result: - result = 'https:' + result - return result - except: # 2020.7.17 Repair Cover Url crawl - result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@data-src")[index] - if not 'https' in result: - result = 'https:' + result - return result -def getCover(html): - result = html.xpath('//*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li[1]/picture/source/@srcset')[0] - return result.replace('.webp', '.jpg') -def getDirector(html): - try: - result = html.xpath('//th[contains(text(),"剧情")]/../td/a/text()')[0] - except: - result = '' - return result -def getOutline(html): - total = [] - result = html.xpath('//*[@class="work_parts_area"]/p/text()') - for i in result: - total.append(i.strip('\r\n')) - return str(total).strip(" ['']").replace("', '', '",r'\n').replace("', '",r'\n').strip(", '', '") -def getSeries(html): - try: - try: - result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0] - except: - result = html.xpath('//th[contains(text(),"社团名")]/../td/span[1]/a/text()')[0] - except: - result = '' - return result -# -def getExtrafanart(html): - try: - result = [] - for i in html.xpath('//*[@id="work_left"]/div/div/div[1]/div/@data-src'): - result.append("https:" + i) - except: - result = '' - return result -def main(number): - try: - if "RJ" in number or "VJ" in number: - number = number.upper() - htmlcode = get_html('https://www.dlsite.com/maniax/work/=/product_id/' + number + '.html/?locale=zh_CN', cookies={'locale': 'zh-cn'}) - html = etree.fromstring(htmlcode, etree.HTMLParser()) - else: - htmlcode = get_html(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie', cookies={'locale': 'zh-cn'}) - html = etree.HTML(htmlcode) - search_result = html.xpath('//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href') - if len(search_result) == 0: - number = number.replace("THE ANIMATION", "").replace("he Animation", "").replace("t", "").replace("T","") - html = etree.HTML(get_html( - f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie', cookies={'locale': 'zh-cn'})) - search_result = html.xpath('//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href') - if len(search_result) == 0: - if "~" in number: - number = number.replace("~","〜") - elif "〜" in number: - number = number.replace("〜","~") - html = etree.HTML(get_html( - f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie', cookies={'locale': 'zh-cn'})) - search_result = html.xpath('//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href') - if len(search_result) == 0: - number = number.replace('上巻', '').replace('下巻', '').replace('前編', '').replace('後編', '') - html = etree.HTML(get_html( - 
f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie', cookies={'locale': 'zh-cn'})) - search_result = html.xpath('//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href') - a = search_result[0] - html = etree.HTML(get_html(a,cookies={'locale': 'zh-cn'})) - number = str(re.findall("\wJ\w+",a)).strip(" [']") - dic = { - 'actor': getStudio(html), - 'title': getTitle(html), - 'studio': getStudio(html), - 'outline': getOutline(html), - 'runtime': '', - 'director': getDirector(html), - 'release': getRelease(html), - 'number': number, - 'cover': 'https:' + getCover(html), - 'cover_small': '', - 'imagecut': 4, - 'tag': getTag(html), - 'label': getLabel(html), - 'year': getYear(getRelease(html)), # str(re.search('\d{4}',getRelease(a)).group()), - 'actor_photo': '', - 'website': 'https://www.dlsite.com/maniax/work/=/product_id/' + number + '.html', - 'source': 'dlsite.py', - 'series': getSeries(html), - 'extrafanart':getExtrafanart(html), - 'allow_number_change':True, - } - js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') - return js - except Exception as e: - if config.getInstance().debug(): - print(e) - data = { - "title": "", - } - js = json.dumps( - data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":") - ) - return js - -# main('DV-1562') -# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。") -if __name__ == "__main__": - config.getInstance().set_override("debug_mode:switch=1") - print(main('牝教師4~穢された教壇~ 「生意気ドジっ娘女教師・美結~高飛車ハメ堕ち2濁金」')) - print(main('RJ329607')) diff --git a/WebCrawler/fanza.py b/WebCrawler/fanza.py deleted file mode 100644 index 38a919a..0000000 --- a/WebCrawler/fanza.py +++ /dev/null @@ -1,205 +0,0 @@ -#!/usr/bin/python3 -# -*- coding: utf-8 -*- -import sys -sys.path.append('../') -from urllib.parse import urlencode - -from ADC_function import * -from WebCrawler.crawler import * - -class fanzaCrawler(Crawler): - def getFanzaString(self,string): - result1 = str(self.html.xpath("//td[contains(text(),'"+string+"')]/following-sibling::td/a/text()")).strip(" ['']") - result2 = str(self.html.xpath("//td[contains(text(),'"+string+"')]/following-sibling::td/text()")).strip(" ['']") - return result1+result2 - - def getFanzaStrings(self, string): - result1 = self.html.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/a/text()") - if len(result1) > 0: - return result1 - result2 = self.html.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/text()") - return result2 - - -def getRelease(fanza_Crawler): - result = fanza_Crawler.getFanzaString('発売日:') - if result == '' or result == '----': - result = fanza_Crawler.getFanzaString('配信開始日:') - return result.replace("/", "-").strip('\\n') - - -def getCover(html, number): - cover_number = number - try: - result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0] - except: - # sometimes fanza modify _ to \u0005f for image id - if "_" in cover_number: - cover_number = cover_number.replace("_", r"\u005f") - try: - result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0] - except: - # (TODO) handle more edge case - # print(html) - # raise exception here, same behavior as before - # people's major requirement is fetching the picture - raise ValueError("can not find image") - return result - - -def getOutline(html): - try: - result = str(html.xpath("//div[@class='mg-b20 
lh4']/text()")[0]).replace("\n", "") - if result == "": - result = str(html.xpath("//div[@class='mg-b20 lh4']//p/text()")[0]).replace("\n", "") - except: - # (TODO) handle more edge case - # print(html) - return "" - return result - - -def getExtrafanart(htmlcode): # 获取剧照 - html_pather = re.compile(r'<div id=\"sample-image-block\"[\s\S]*?<br></div>\n</div>') - html = html_pather.search(htmlcode) - if html: - html = html.group() - extrafanart_pather = re.compile(r'<img.*?src=\"(.*?)\"') - extrafanart_imgs = extrafanart_pather.findall(html) - if extrafanart_imgs: - s = [] - for img_url in extrafanart_imgs: - img_urls = img_url.rsplit('-', 1) - img_url = img_urls[0] + 'jp-' + img_urls[1] - s.append(img_url) - return s - return '' - -def main(number): - # fanza allow letter + number + underscore, normalize the input here - # @note: I only find the usage of underscore as h_test123456789 - fanza_search_number = number - # AV_Data_Capture.py.getNumber() over format the input, restore the h_ prefix - if fanza_search_number.startswith("h-"): - fanza_search_number = fanza_search_number.replace("h-", "h_") - - fanza_search_number = re.sub(r"[^0-9a-zA-Z_]", "", fanza_search_number).lower() - - fanza_urls = [ - "https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=", - "https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=", - "https://www.dmm.co.jp/digital/anime/-/detail/=/cid=", - "https://www.dmm.co.jp/mono/anime/-/detail/=/cid=", - "https://www.dmm.co.jp/digital/videoc/-/detail/=/cid=", - "https://www.dmm.co.jp/digital/nikkatsu/-/detail/=/cid=", - "https://www.dmm.co.jp/rental/-/detail/=/cid=", - ] - chosen_url = "" - fanza_Crawler = '' - - for url in fanza_urls: - chosen_url = url + fanza_search_number - htmlcode = get_html( - "https://www.dmm.co.jp/age_check/=/declared=yes/?{}".format( - urlencode({"rurl": chosen_url}) - ) - ) - fanza_Crawler = fanzaCrawler(htmlcode) - if "404 Not Found" not in htmlcode: - break - if "404 Not Found" in htmlcode: - return json.dumps({"title": "",}) - try: - # for some old page, the input number does not match the page - # for example, the url will be cid=test012 - # but the hinban on the page is test00012 - # so get the hinban first, and then pass it to following functions - fanza_hinban = fanza_Crawler.getFanzaString('品番:') - out_num = fanza_hinban - number_lo = number.lower() - html = etree.fromstring(htmlcode, etree.HTMLParser()) - if (re.sub('-|_', '', number_lo) == fanza_hinban or - number_lo.replace('-', '00') == fanza_hinban or - number_lo.replace('-', '') + 'so' == fanza_hinban - ): - out_num = number - - director = fanza_Crawler.getFanzaString('監督:') - if "anime" in chosen_url: - director = "" - actor = fanza_Crawler.getStrings("//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()") - if "anime" in chosen_url: - actor = "" - # ---- - series = fanza_Crawler.getFanzaString('シリーズ:') - if series == "----": - series = "" - label = fanza_Crawler.getFanzaString('レーベル') - if label == "----": - label = "" - - data = { - "title": fanza_Crawler.getString('//*[starts-with(@id, "title")]/text()').strip(), - "studio": fanza_Crawler.getFanzaString('メーカー'), - "outline": getOutline(html), - "runtime": str(re.search(r'\d+',fanza_Crawler.getString("//td[contains(text(),'収録時間')]/following-sibling::td/text()")).group()).strip(" ['']"), - "director": director, - "actor": actor, - "release": getRelease(fanza_Crawler), - "number": out_num, - "cover": getCover(html, fanza_hinban), - "imagecut": 1, - "tag": fanza_Crawler.getFanzaStrings('ジャンル:'), - "extrafanart": 
getExtrafanart(htmlcode), - "label": label, - "year": re.findall('\d{4}',getRelease(fanza_Crawler))[0], # str(re.search('\d{4}',getRelease(a)).group()), - "actor_photo": "", - "website": chosen_url, - "source": "fanza.py", - "series": series, - } - except Exception as e: - data = { - "title": "", - } - js = json.dumps( - data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":") - ) # .encode('UTF-8') - return js - - -def main_htmlcode(number): - # fanza allow letter + number + underscore, normalize the input here - # @note: I only find the usage of underscore as h_test123456789 - fanza_search_number = number - # AV_Data_Capture.py.getNumber() over format the input, restore the h_ prefix - if fanza_search_number.startswith("h-"): - fanza_search_number = fanza_search_number.replace("h-", "h_") - - fanza_search_number = re.sub(r"[^0-9a-zA-Z_]", "", fanza_search_number).lower() - - fanza_urls = [ - "https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=", - "https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=", - "https://www.dmm.co.jp/digital/anime/-/detail/=/cid=", - "https://www.dmm.co.jp/mono/anime/-/detail/=/cid=", - "https://www.dmm.co.jp/digital/videoc/-/detail/=/cid=", - "https://www.dmm.co.jp/digital/nikkatsu/-/detail/=/cid=", - ] - chosen_url = "" - for url in fanza_urls: - chosen_url = url + fanza_search_number - htmlcode = get_html(chosen_url) - if "404 Not Found" not in htmlcode: - break - if "404 Not Found" in htmlcode: - return json.dumps({"title": "",}) - return htmlcode - - -if __name__ == "__main__": - # print(main("DV-1562")) - # print(main("96fad1217")) - print(main("AES-002")) - print(main("MIAA-391")) - print(main("OBA-326")) diff --git a/WebCrawler/fc2.py b/WebCrawler/fc2.py deleted file mode 100644 index 7eae92b..0000000 --- a/WebCrawler/fc2.py +++ /dev/null @@ -1,80 +0,0 @@ -import sys -sys.path.append('../') -import re -import json -import config -import ADC_function -from WebCrawler.crawler import * - -def getExtrafanart(htmlcode): # 获取剧照 - html_pather = re.compile(r'<ul class=\"items_article_SampleImagesArea\"[\s\S]*?</ul>') - html = html_pather.search(htmlcode) - if html: - html = html.group() - extrafanart_pather = re.compile(r'<a href=\"(.*?)\"') - extrafanart_imgs = extrafanart_pather.findall(html) - if extrafanart_imgs: - return extrafanart_imgs - return '' - -def getTrailer(htmlcode, number): - video_pather = re.compile(r'\'[a-zA-Z0-9]{32}\'') - video = video_pather.findall(htmlcode) - if video: - try: - video_url = video[0].replace('\'', '') - video_url = 'https://adult.contents.fc2.com/api/v2/videos/' + number + '/sample?key=' + video_url - url_json = eval(ADC_function.get_html(video_url))['path'].replace('\\', '') - return url_json - except: - return '' - else: - return '' - -def main(number): - try: - number = number.replace('FC2-', '').replace('fc2-', '') - htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/', encoding='utf-8') - fc2_crawler = Crawler(htmlcode2) - actor = fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()') - if actor == "": - actor = '素人' - lx = etree.fromstring(htmlcode2, etree.HTMLParser()) - cover = fc2_crawler.getString("//div[@class='items_article_MainitemThumb']/span/img/@src") - cover = ADC_function.urljoin('https://adult.contents.fc2.com', cover) - release = fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/div[2]/p/text()').\ - strip(" ['販売日 : ']").replace('/','-') - dic = { - 'title': 
fc2_crawler.getString('/html/head/title/text()'),
-            'studio': fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()'),
-            'year': re.findall('\d{4}', release)[0],
-            'outline': '',  # getOutline_fc2com(htmlcode2),
-            'runtime': str(lx.xpath("//p[@class='items_article_info']/text()")[0]),
-            'director': fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()'),
-            'actor': actor,
-            'release': release,
-            'number': 'FC2-' + number,
-            'label': '',
-            'cover': cover,
-            'thumb': cover,
-            'extrafanart': getExtrafanart(htmlcode2),
-            "trailer": getTrailer(htmlcode2, number),
-            'imagecut': 0,
-            'tag': fc2_crawler.getStrings("//a[@class='tag tagTag']/text()"),
-            'actor_photo': '',
-            'website': 'https://adult.contents.fc2.com/article/' + number + '/',
-            'source': 'https://adult.contents.fc2.com/article/' + number + '/',
-            'series': '',
-        }
-    except Exception as e:
-        if ADC_function.config.getInstance().debug():
-            print(e)
-        dic = {"title": ""}
-    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
-    return js
-
-if __name__ == '__main__':
-    config.getInstance().set_override("debug_mode:switch=1")
-    #print(main('FC2-2182382'))
-    #print(main('FC2-607854'))
-    print(main('FC2-2787433'))
diff --git a/WebCrawler/fc2club.py b/WebCrawler/fc2club.py
deleted file mode 100644
index 53d0d58..0000000
--- a/WebCrawler/fc2club.py
+++ /dev/null
@@ -1,96 +0,0 @@
-import sys
-sys.path.append('../')
-from lxml import etree  # need install
-import json
-import ADC_function
-
-def getTitle_fc2com(htmlcode):  # get the title
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    result = str(html.xpath('//*[@class="show-top-grids"]/div[1]/h3/text()')).strip(" ['']")
-    return result
-def getActor_fc2com(htmlcode):
-    try:
-        html = etree.fromstring(htmlcode, etree.HTMLParser())
-        result = str(html.xpath('//*[@class="show-top-grids"]/div[1]/h5[5]/a/text()')).strip(" ['']")
-        return result
-    except:
-        return ''
-def getStudio_fc2com(htmlcode):  # get the studio
-    try:
-        html = etree.fromstring(htmlcode, etree.HTMLParser())
-        result = str(html.xpath('//*[@class="show-top-grids"]/div[1]/h5[3]/a[1]/text()')).strip(" ['']")
-        return result
-    except:
-        return ''
-def getNum_fc2com(htmlcode):  # get the movie number
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    title = str(html.xpath('//*[@class="show-top-grids"]/div[1]/h3/text()')).strip(" ['']")
-    num = title.split(' ')[0]
-    if num.startswith('FC2') != True:
-        num = ''
-    return num
-def getRelease_fc2com(htmlcode2):  #
-    return ''
-def getCover_fc2com(htmlcode2):  # get the cover img
-    html = etree.fromstring(htmlcode2, etree.HTMLParser())
-    imgUrl = str(html.xpath('//*[@class="slides"]/li[1]/img/@src')).strip(" ['']")
-    imgUrl = imgUrl.replace('../', 'https://fc2club.net/')
-    return imgUrl
-def getTag_fc2com(htmlcode):  # get the tags
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    a = html.xpath('//*[@class="show-top-grids"]/div[1]/h5[4]/a')
-    tag = []
-    for i in range(len(a)):
-        tag.append(str(a[i].xpath('text()')).strip("['']"))
-    return tag
-def getYear_fc2com(release):
-    return ''
-
-def getExtrafanart(htmlcode):  # get the stills
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    imgUrl = str(html.xpath('//*[@class="slides"]/li[1]/img/@src')).strip(" ['']")
-    imgUrl = imgUrl.replace('../', 'https://fc2club.net/')
-    return imgUrl
-
-def getTrailer(htmlcode):
-    return ''
-
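getCover_fc2com and getExtrafanart above absolutize the cover's relative '../' path with a string replace; urllib.parse.urljoin performs the same resolution without hard-coding the page depth. A sketch with an invented src attribute:

from urllib.parse import urljoin

page_url = 'https://fc2club.net/html/FC2-402422.html'
rel_src = '../uploadfile/sample.jpg'   # hypothetical src value
urljoin(page_url, rel_src)             # 'https://fc2club.net/uploadfile/sample.jpg'

-def main(number):
-    try:
-        number = number.replace('FC2-', '').replace('fc2-', '')
-        webUrl = 'https://fc2club.net/html/FC2-' + 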
number + '.html' - htmlcode2 = ADC_function.get_html(webUrl) - actor = getActor_fc2com(htmlcode2) - if getActor_fc2com(htmlcode2) == '': - actor = 'FC2系列' - dic = { - 'title': getTitle_fc2com(htmlcode2), - 'studio': getStudio_fc2com(htmlcode2), - 'year': getYear_fc2com(getRelease_fc2com(htmlcode2)), - 'outline': '', # getOutline_fc2com(htmlcode2), - 'runtime': '', - 'director': getStudio_fc2com(htmlcode2), - 'actor': actor, - 'release': getRelease_fc2com(htmlcode2), - 'number': 'FC2-' + number, - 'label': '', - 'cover': getCover_fc2com(htmlcode2), - 'extrafanart': getExtrafanart(htmlcode2), - "trailer": getTrailer(htmlcode2), - 'imagecut': 0, - 'tag': getTag_fc2com(htmlcode2), - 'actor_photo': '', - 'website': 'https://fc2club.net/html/FC2-' + number + '.html/', - 'source': 'https://fc2club.net/html/FC2-' + number + '.html/', - 'series': '', - } - except Exception as e: - if ADC_function.config.getInstance().debug(): - print(e) - dic = {"title": ""} - js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') - return js - -if __name__ == '__main__': - print(main('FC2-402422')) - diff --git a/WebCrawler/gcolle.py b/WebCrawler/gcolle.py deleted file mode 100644 index 749da1f..0000000 --- a/WebCrawler/gcolle.py +++ /dev/null @@ -1,86 +0,0 @@ -import sys -sys.path.append('../') - -from WebCrawler.crawler import * -from ADC_function import * - -def main(number): - save_cookies = False - cookie_filename = 'gcolle.json' - try: - gcolle_cooikes, cookies_filepath = load_cookies(cookie_filename) - session = get_html_session(cookies=gcolle_cooikes) - number = number.upper().replace('GCOLLE-','') - - htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + number).text - gcolle_crawler = Crawler(htmlcode) - r18_continue = gcolle_crawler.getString('//*[@id="main_content"]/table[1]/tbody/tr/td[2]/table/tbody/tr/td/h4/a[2]/@href') - if r18_continue and r18_continue.startswith('http'): - htmlcode = session.get(r18_continue).text - gcolle_crawler = Crawler(htmlcode) - save_cookies = True - cookies_filepath and len(cookies_filepath) and Path(cookies_filepath).is_file() and Path(cookies_filepath).unlink(missing_ok=True) - - number_html = gcolle_crawler.getString('//td[contains(text(),"商品番号")]/../td[2]/text()') - if number != number_html: - raise Exception('[-]gcolle.py: number not match') - - if save_cookies: - cookies_save = Path.home() / f".local/share/mdc/{cookie_filename}" - cookies_save.parent.mkdir(parents=True, exist_ok=True) - cookies_save.write_text(json.dumps(session.cookies.get_dict(), sort_keys=True, indent=4), encoding='utf-8') - - # get extrafanart url - if len(gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src')) == 0: - extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/a/img/@src') - else: - extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src') - # Add "https:" in each extrafanart url - for i in range(len(extrafanart)): - extrafanart[i] = 'https:' + extrafanart[i] - - dic = { - "title": gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[1]/td/h1/text()').strip(), - "studio": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'), - "year": re.findall('\d{4}',gcolle_crawler.getString('//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime'))[0], - "outline": gcolle_crawler.getOutline('//*[@id="cart_quantity"]/table/tr[3]/td/p/text()'), - "runtime": '', - "director": 
gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'), - "actor": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'), - "release": re.findall('\d{4}-\d{2}-\d{2}',gcolle_crawler.getString('//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime'))[0], - "number": "GCOLLE-" + str(number_html), - "cover": "https:" + gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href'), - "thumb": "https:" + gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href'), - "trailer": '', - "actor_photo":'', - "imagecut": 4, # 该值为4时同时也是有码影片 也用人脸识别裁剪封面 - "tag": gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[4]/td/a/text()'), - "extrafanart":extrafanart, - "label": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'), - "website": 'https://gcolle.net/product_info.php/products_id/' + number, - "source": 'gcolle.py', - "series": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'), - '无码': False, - } - # for k,v in dic.items(): - # if k == 'outline': - # print(k,len(v)) - # else: - # print(k,v) - # print('===============================================================') - except Exception as e: - dic = {'title':''} - if config.getInstance().debug(): - print(e) - - return dic - -if __name__ == '__main__': - from pprint import pprint - config.getInstance().set_override("debug_mode:switch=1") - pprint(main('840724')) - pprint(main('840386')) - pprint(main('838671')) - pprint(main('814179')) - pprint(main('834255')) - pprint(main('814179')) diff --git a/WebCrawler/getchu.py b/WebCrawler/getchu.py deleted file mode 100644 index 2d3a699..0000000 --- a/WebCrawler/getchu.py +++ /dev/null @@ -1,133 +0,0 @@ -import sys -sys.path.append('../') -from ADC_function import * -from WebCrawler.crawler import * -import re -import time -from urllib.parse import quote - -JSON_HEADERS = {"Referer": "https://dl.getchu.com/"} -COOKIES_DL = {"adult_check_flag": "1"} -COOKIES_WWW = {'getchu_adalt_flag': 'getchu.com'} - -GETCHU_WWW_SEARCH_URL = 'http://www.getchu.com/php/search.phtml?genre=anime_dvd&search_keyword=_WORD_&check_key_dtl=1&submit=' -GETCHU_DL_SEARCH_URL = 'https://dl.getchu.com/search/search_list.php?dojin=1&search_category_id=&search_keyword=_WORD_&btnWordSearch=%B8%A1%BA%F7&action=search&set_category_flag=1' -GETCHU_WWW_URL = 'http://www.getchu.com/soft.phtml?id=_WORD_' -GETCHU_DL_URL = 'https://dl.getchu.com/i/item_WORD_' - -def get_dl_getchu(number): - if "item" in number or 'GETCHU' in number.upper(): - number = re.findall('\d+',number)[0] - else: - htmlcode = get_html(GETCHU_DL_SEARCH_URL.replace("_WORD_", number), - json_headers=JSON_HEADERS, cookies=COOKIES_DL) - getchu = Crawler(htmlcode) - url = getchu.getString( - '/html/body/div[1]/table/tr/td/table[4]/tr/td[2]/table/tr[2]/td/table/tr/td/table/tr/td[2]/div/a[1]/@href') - if url == "": - return None - number = re.findall('\d+', url)[0] - htmlcode = get_html(GETCHU_DL_URL.replace("_WORD_", number), json_headers=JSON_HEADERS, cookies=COOKIES_DL) - getchu = Crawler(htmlcode) - dic = { - "title": getchu.getString("//div[contains(@style,'color: #333333; padding: 3px 0px 0px 5px;')]/text()"), - "cover": "https://dl.getchu.com" + getchu.getString("//td[contains(@bgcolor,'#ffffff')]/img/@src"), - "director": getchu.getString("//td[contains(text(),'作者')]/following-sibling::td/text()").strip(), - "studio": getchu.getString("//td[contains(text(),'サークル')]/following-sibling::td/a/text()").strip(), - "actor": 
getchu.getString("//td[contains(text(),'サークル')]/following-sibling::td/a/text()").strip(), - "label": getchu.getString("//td[contains(text(),'サークル')]/following-sibling::td/a/text()").strip(), - "runtime": str(re.findall('\d+', str(getchu.getString( - "//td[contains(text(),'画像数&ページ数')]/following-sibling::td/text()")))).strip(" ['']"), - "release": getchu.getString("//td[contains(text(),'配信開始日')]/following-sibling::td/text()").replace("/", "-"), - "tag": getchu.getStrings("//td[contains(text(),'趣向')]/following-sibling::td/a/text()"), - "outline": getchu.getStrings("//*[contains(text(),'作品内容')]/following-sibling::td/text()"), - "extrafanart": getchu.getStrings("//td[contains(@style,'background-color: #444444;')]/a/@href"), - "series": getchu.getString("//td[contains(text(),'サークル')]/following-sibling::td/a/text()"), - "number": 'GETCHU-' + re.findall('\d+',number)[0], - "imagecut": 4, - "year": str(re.findall('\d{4}', str(getchu.getString( - "//td[contains(text(),'配信開始日')]/following-sibling::td/text()").replace("/", "-")))).strip(" ['']"), - "actor_photo": "", - "website": "https://dl.getchu.com/i/" + number, - "source": "getchu.py", - "allow_number_change": True, - } - extrafanart = [] - for i in dic['extrafanart']: - i = "https://dl.getchu.com" + i - extrafanart.append(i) - dic['extrafanart'] = extrafanart - time.sleep(1) - return dic - -def get_www_getchu(number): - number = quote(number, encoding="euc_jp") - getchu = Crawler(get_html(GETCHU_WWW_SEARCH_URL.replace("_WORD_", number), cookies=COOKIES_WWW)) - url2 = getchu.getString('//*[@id="detail_block"]/div/table/tr[1]/td/a[1]/@href') - if url2 == '': - getchu = Crawler(get_html(GETCHU_WWW_SEARCH_URL.replace("_WORD_", number), cookies=COOKIES_WWW)) - url2 = getchu.getString('//*[@id="detail_block"]/div/table/tr[1]/td/a[1]/@href') - if url2 == "": - return None - url2 = url2.replace('../', 'http://www.getchu.com/') - getchu = Crawler(get_html(url2, cookies=COOKIES_WWW)) - dic = { - "title": getchu.getString('//*[@id="soft-title"]/text()').strip(), - "cover": "http://www.getchu.com" + getchu.getString( - "/html/body/div[1]/table[2]/tr[1]/td/a/@href").replace("./", '/'), - "director": getchu.getString("//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"), - "studio": getchu.getString("//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()").strip(), - "actor": getchu.getString("//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()").strip(), - "label": getchu.getString("//td[contains(text(),'ジャンル:')]/following-sibling::td/text()").strip(), - "runtime": '', - "release": getchu.getString("//td[contains(text(),'発売日:')]/following-sibling::td/a/text()").replace("/", "-").strip(), - "tag": getchu.getStrings("//td[contains(text(),'カテゴリ')]/following-sibling::td/a/text()"), - "outline": getchu.getStrings("//div[contains(text(),'商品紹介')]/following-sibling::div/text()"), - "extrafanart": getchu.getStrings("//div[contains(text(),'サンプル画像')]/following-sibling::div/a/@href"), - "series": getchu.getString("//td[contains(text(),'ジャンル:')]/following-sibling::td/text()").strip(), - "number": 'GETCHU-' + re.findall('\d+', url2.replace("http://www.getchu.com/soft.phtml?id=", ""))[0], - "imagecut": 0, - "year": str(re.findall('\d{4}', str(getchu.getString( - "//td[contains(text(),'発売日:')]/following-sibling::td/a/text()").replace("/", "-")))).strip(" ['']"), - "actor_photo": "", - "website": url2, - "headers": {'referer': url2}, - "source": "getchu.py", - "allow_number_change": True, - } - extrafanart = [] - for i in dic['extrafanart']: 
- i = "http://www.getchu.com" + i.replace("./", '/') - if 'jpg' in i: - extrafanart.append(i) - dic['extrafanart'] = extrafanart - time.sleep(1) - return dic - -def main(number): - number = number.replace("-C", "") - dic = {} - if "item" in number: - sort = ["get_dl_getchu(number)", "get_www_getchu(number)"] - else: - sort = ["get_www_getchu(number)", "get_dl_getchu(number)"] - for i in sort: - dic = eval(i) - if dic != None: - break - if dic == None: - return {"title" : ""} - outline = '' - _list = dic['outline'] - for i in _list: - outline = outline + i - dic['outline'] = outline - - result = json.dumps(dic,ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), ) - return result - -if __name__ == '__main__': - test = [] - for i in test: - print(i) - print(main(i)) diff --git a/WebCrawler/jav321.py b/WebCrawler/jav321.py deleted file mode 100644 index 223e862..0000000 --- a/WebCrawler/jav321.py +++ /dev/null @@ -1,185 +0,0 @@ -import sys -sys.path.append('../') -import json -from bs4 import BeautifulSoup -from lxml import html -from ADC_function import post_html -import re - - -def main(number: str) -> json: - try: - result = post_html(url="https://www.jav321.com/search", query={"sn": number}) - soup = BeautifulSoup(result.text, "html.parser") - lx = html.fromstring(str(soup)) - except: - dic = {"title": ""} - return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':')) - - if "/video/" in result.url: - data = parse_info(soup) - - dic = { - "title": get_title(lx), - "year": get_year(data), - "outline": get_outline(lx), - "director": "", - "cover": get_cover(lx), - "imagecut": 1, - "trailer": get_trailer(result.text), - "extrafanart": get_extrafanart(result.text), - "actor_photo": "", - "website": result.url, - "source": "jav321.py", - **data, - } - else: - dic = {"title": ""} - - return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':')) - -def get_title(lx: html.HtmlElement) -> str: - return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[1]/h3/text()")[0].strip() - - -def parse_info(soup: BeautifulSoup) -> dict: - data = soup.select_one("div.row > div.col-md-9") - - if data: - dd = str(data).split("<br/>") - data_dic = {} - for d in dd: - data_dic[get_bold_text(h=d)] = d - - return { - "actor": get_actor(data_dic), - "label": get_label(data_dic), - "studio": get_studio(data_dic), - "tag": get_tag(data_dic), - "number": get_number(data_dic).upper(), - "release": get_release(data_dic), - "runtime": get_runtime(data_dic).replace(" minutes", ""), - "series": get_series(data_dic), - } - else: - return {"title": ""} - - -def get_bold_text(h: str) -> str: - soup = BeautifulSoup(h, "html.parser") - if soup.b: - return soup.b.text - else: - return "UNKNOWN_TAG" - - -def get_anchor_info(h: str) -> str: - result = [] - - data = BeautifulSoup(h, "html.parser").find_all("a", href=True) - for d in data: - result.append(d.text) - - return ",".join(result) - - -def get_text_info(h: str) -> str: - return h.split(": ")[1] - -def get_trailer(html) -> str: - videourl_pather = re.compile(r'<source src=\"(.*?)\"') - videourl = videourl_pather.findall(html) - if videourl: - url = videourl[0].replace('awscc3001.r18.com', 'cc3001.dmm.co.jp').replace('cc3001.r18.com', 'cc3001.dmm.co.jp') - return url - else: - return '' - -def get_extrafanart(htmlcode): # 获取剧照 - html_pather = re.compile(r'<div class=\"col\-md\-3\"><div class=\"col\-xs\-12 col\-md\-12\">[\s\S]*?</script><script async src=\"\/\/adserver\.juicyads\.com/js/jads\.js\">') - 
html = html_pather.search(htmlcode)
-    if html:
-        html = html.group()
-        extrafanart_pather = re.compile(r'<img.*?src=\"(.*?)\"')
-        extrafanart_imgs = extrafanart_pather.findall(html)
-        if extrafanart_imgs:
-            return extrafanart_imgs
-    return ''
-
-def get_cover(lx: html.HtmlElement) -> str:
-    return lx.xpath("/html/body/div[2]/div[2]/div[1]/p/a/img/@src")[0]
-
-
-def get_outline(lx: html.HtmlElement) -> str:
-    return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[3]/div/text()")[0]
-
-def get_series2(lx: html.HtmlElement) -> str:
-    return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[1]/div[2]/a[11]/text()")[0]
-
-
-def get_actor(data: hash) -> str:
-    if "出演者" in data:
-        return get_anchor_info(data["出演者"])
-    else:
-        return ""
-
-
-def get_label(data: hash) -> str:
-    if "メーカー" in data:
-        return get_anchor_info(data["メーカー"])
-    else:
-        return ""
-
-
-def get_tag(data: hash) -> str:
-    if "ジャンル" in data:
-        return get_anchor_info(data["ジャンル"])
-    else:
-        return ""
-
-
-def get_studio(data: hash) -> str:
-    if "メーカー" in data:
-        return get_anchor_info(data["メーカー"])
-    else:
-        return ""
-
-
-def get_number(data: hash) -> str:
-    if "品番" in data:
-        return get_text_info(data["品番"])
-    else:
-        return ""
-
-
-def get_release(data: hash) -> str:
-    if "配信開始日" in data:
-        return get_text_info(data["配信開始日"])
-    else:
-        return ""
-
-
-def get_runtime(data: hash) -> str:
-    if "収録時間" in data:
-        return get_text_info(data["収録時間"])
-    else:
-        return ""
-
-
-def get_year(data: hash) -> str:
-    if "release" in data:
-        return data["release"][:4]
-    else:
-        return ""
-
-
-def get_series(data: hash) -> str:
-    if "シリーズ" in data:
-        return get_anchor_info(data["シリーズ"])
-    else:
-        return ""
-
-
-if __name__ == "__main__":
-    print(main("jul-404"))
diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py
deleted file mode 100644
index 515f387..0000000
--- a/WebCrawler/javbus.py
+++ /dev/null
@@ -1,184 +0,0 @@
-import sys
-sys.path.append('../')
-from ADC_function import *
-from WebCrawler.storyline import getStoryline
-import inspect
-
-def getActorPhoto(html):
-    actors = html.xpath('//div[@class="star-name"]/../a/img')
-    d = {}
-    for i in actors:
-        p = i.attrib['src']
-        if "nowprinting.gif" in p:
-            continue
-        t = i.attrib['title']
-        d[t] = urljoin("https://www.javbus.com", p)
-    return d
-def getTitle(html):  # get the title
-    title = str(html.xpath('/html/head/title/text()')[0])
-    title = str(re.findall('^.+?\s+(.*) - JavBus$', title)[0]).strip()
-    return title
-def getStudioJa(html):
-    x = html.xpath('//span[contains(text(),"メーカー:")]/../a/text()')
-    return str(x[0]) if len(x) else ''
-def getStudio(html):  # get the studio
-    x = html.xpath('//span[contains(text(),"製作商:")]/../a/text()')
-    return str(x[0]) if len(x) else ''
-def getYear(html):  # get the year
-    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']").strip()
-    return result[:4] if len(result) >= len('2000-01-01') else ''
-def getCover(html):  # get the cover link
-    image = str(html.xpath('//a[@class="bigImage"]/@href')[0])
-    return urljoin("https://www.javbus.com", image)
-def getRelease(html):  # get the release date
-    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
-    return result
-def getRuntime(html):  # get the runtime in minutes (revised)
-    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[3]/text()')).strip(" ['']分鐘")
-    return result
-def getActor(html):  # get the actresses
-    b = []
-    actors = html.xpath('//div[@class="star-name"]/a')
-    for i in actors:
-        b.append(i.attrib['title'])
-    return b
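On javbus pages the movie number and its genre tags travel together in the keywords meta tag; getNum below takes the first comma-separated field and getTag (further down) takes the rest. A self-contained sketch with an invented header:

from lxml import etree

head = '<html><head><meta name="keywords" content="ABC-123,TagOne,TagTwo"/></head></html>'
kwdlist = etree.HTML(head).xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
number, tags = kwdlist[0], kwdlist[1:]   # 'ABC-123', ['TagOne', 'TagTwo']

-def getNum(html):  # get the movie number
-    kwdlist = 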
html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',') - return kwdlist[0] -def getDirectorJa(html): - x = html.xpath('//span[contains(text(),"監督:")]/../a/text()') - return str(x[0]) if len(x) else '' -def getDirector(html): #获取导演 - x = html.xpath('//span[contains(text(),"導演:")]/../a/text()') - return str(x[0]) if len(x) else '' -def getCID(html): - string = html.xpath("//a[contains(@class,'sample-box')][1]/@href")[0].replace('https://pics.dmm.co.jp/digital/video/','') - result = re.sub('/.*?.jpg','',string) - return result -def getOutline(number, title, uncensored): #获取剧情介绍 多进程并发查询 - if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'): - return '' # 从airav.py过来的调用不计算outline直接返回,避免重复抓取数据拖慢处理速度 - return getStoryline(number,title, 无码=uncensored) -def getSeriseJa(html): - x = html.xpath('//span[contains(text(),"シリーズ:")]/../a/text()') - return str(x[0]) if len(x) else '' -def getSerise(html): #获取系列 - x = html.xpath('//span[contains(text(),"系列:")]/../a/text()') - return str(x[0]) if len(x) else '' -def getTag(html): # 获取标签 - klist = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',') - return klist[1:] -def getExtrafanart(htmlcode): # 获取剧照 - html_pather = re.compile(r'<div id=\"sample-waterfall\">[\s\S]*?</div></a>\s*?</div>') - html = html_pather.search(htmlcode) - if html: - html = html.group() - extrafanart_pather = re.compile(r'<a class=\"sample-box\" href=\"(.*?)\"') - extrafanart_imgs = extrafanart_pather.findall(html) - if extrafanart_imgs: - return [urljoin('https://www.javbus.com',img) for img in extrafanart_imgs] - return '' -def getUncensored(html): - x = html.xpath('//*[@id="navbar"]/ul[1]/li[@class="active"]/a[contains(@href,"uncensored")]') - return bool(x) - -def main_uncensored(number): - w_number = number.replace('.', '-') - htmlcode = get_html('https://www.javbus.red/' + w_number) - if "<title>404 Page Not Found" in htmlcode: - return {"title": ""} - lx = etree.fromstring(htmlcode, etree.HTMLParser()) - title = getTitle(lx) - dic = { - 'title': title, - 'studio': getStudioJa(lx), - 'year': getYear(lx), - 'outline': getOutline(w_number, title, True), - 'runtime': getRuntime(lx), - 'director': getDirectorJa(lx), - 'actor': getActor(lx), - 'release': getRelease(lx), - 'number': getNum(lx), - 'cover': getCover(lx), - 'tag': getTag(lx), - 'extrafanart': getExtrafanart(htmlcode), - 'label': getSeriseJa(lx), - 'imagecut': 0, - 'actor_photo': getActorPhoto(lx), - 'website': 'https://www.javbus.red/' + w_number, - 'source': 'javbus.py', - 'series': getSeriseJa(lx), - '无码': True - } - js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') - return js - - -def main(number): - try: - try: - url = "https://www." 
+ secrets.choice([ - 'buscdn.fun', 'busdmm.fun', 'busfan.fun', 'busjav.fun', - 'cdnbus.fun', - 'dmmbus.fun', 'dmmsee.fun', - 'fanbus.us', - 'seedmm.fun', - ]) + "/" - try: - htmlcode = get_html(url + number) - except: - htmlcode = get_html('https://www.javbus.com/' + number) - if "<title>404 Page Not Found" in htmlcode: - return {"title": ""} - lx = etree.fromstring(htmlcode,etree.HTMLParser()) - title = getTitle(lx) - dic = { - 'title': title, - 'studio': getStudio(lx), - 'year': getYear(lx), - 'outline': getOutline(number, title, getUncensored(lx)), - 'runtime': getRuntime(lx), - 'director': getDirector(lx), - 'actor': getActor(lx), - 'release': getRelease(lx), - 'number': getNum(lx), - 'cover': getCover(lx), - 'imagecut': 1, - 'tag': getTag(lx), - 'extrafanart': getExtrafanart(htmlcode), - 'label': getSerise(lx), - 'actor_photo': getActorPhoto(lx), - 'website': 'https://www.javbus.com/' + number, - 'source': 'javbus.py', - 'series': getSerise(lx), - '无码': getUncensored(lx) - } - js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), ) # .encode('UTF-8') - return js - except: - return main_uncensored(number) - except Exception as e: - if config.getInstance().debug(): - print(e) - data = { - "title": "", - } - js = json.dumps( - data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":") - ) - return js - -if __name__ == "__main__" : - config.getInstance().set_override("storyline:switch=0") - config.getInstance().set_override("actor_photo:download_for_kodi=1") - config.getInstance().set_override("debug_mode:switch=1") - print(main('STAR-438')) - print(main('ABP-960')) - print(main('ADV-R0624')) # 404 - print(main('MMNT-010')) - print(main('ipx-292')) - print(main('CEMD-011')) - print(main('CJOD-278')) - print(main('BrazzersExxtra.21.02.01')) - print(main('100221_001')) - print(main('AVSW-061')) diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py deleted file mode 100755 index 6529dd6..0000000 --- a/WebCrawler/javdb.py +++ /dev/null @@ -1,321 +0,0 @@ -import sys -sys.path.append('../') -from ADC_function import * -from WebCrawler.storyline import getStoryline - -def getTitle(html): - browser_title = str(html.xpath("/html/head/title/text()")[0]) - return browser_title[:browser_title.find(' | JavDB')].strip() - -def getActor(html): - actors = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/text()') - genders = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/../strong/@class') - r = [] - idx = 0 - actor_gendor = config.getInstance().actor_gender() - if not actor_gendor in ['female','male','both','all']: - actor_gendor = 'female' - for act in actors: - if((actor_gendor == 'all') - or (actor_gendor == 'both' and genders[idx] in ['symbol female', 'symbol male']) - or (actor_gendor == 'female' and genders[idx] == 'symbol female') - or (actor_gendor == 'male' and genders[idx] == 'symbol male')): - r.append(act) - idx = idx + 1 - return r - -def getaphoto(url, session): - html_page = session.get(url).text - img_url = re.findall(r'<span class\=\"avatar\" style\=\"background\-image\: url\((.*?)\)', html_page) - return img_url[0] if img_url else '' - -def getActorPhoto(html, javdb_site, session): - actorall = html.xpath('//strong[contains(text(),"演員:")]/../span/a[starts-with(@href,"/actors/")]') - if not actorall: - return {} - a = getActor(html) - actor_photo = {} - if not session: - session = get_html_session() - for i in actorall: - x = re.findall(r'/actors/(.*)', i.attrib['href'], re.A) - if not len(x) or not 
len(x[0]) or i.text not in a: - continue - actor_id = x[0] - pic_url = f"https://c1.jdbstatic.com/avatars/{actor_id[:2].lower()}/{actor_id}.jpg" - if not session.head(pic_url).ok: - pic_url = getaphoto(urljoin(f'https://{javdb_site}.com', i.attrib['href']), session) - if len(pic_url): - actor_photo[i.text] = pic_url - return actor_photo - -def getStudio(a, html): - patherr = re.compile(r'<strong>片商\:</strong>[\s\S]*?<a href=\".*?>(.*?)</a></span>') - pianshang = patherr.findall(a) - if pianshang: - result = pianshang[0].strip() - if len(result): - return result - # 以卖家作为工作室 - try: - result = str(html.xpath('//strong[contains(text(),"賣家:")]/../span/a/text()')).strip(" ['']") - except: - result = '' - return result - -def getRuntime(html): - result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']") - result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']") - return str(result1 + result2).strip('+').rstrip('mi') -def getLabel(html): - result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']") - result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']") - return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') -def getNum(html): - result1 = str(html.xpath('//strong[contains(text(),"番號")]/../span/text()')).strip(" ['']") - result2 = str(html.xpath('//strong[contains(text(),"番號")]/../span/a/text()')).strip(" ['']") - return str(result2 + result1).strip('+') -def getYear(getRelease): - patherr = re.compile(r'<strong>日期\:</strong>\s*?.*?<span class="value">(.*?)\-.*?</span>') - dates = patherr.findall(getRelease) - if dates: - result = dates[0] - else: - result = '' - return result - -def getRelease(a): - patherr = re.compile(r'<strong>日期\:</strong>\s*?.*?<span class="value">(.*?)</span>') - dates = patherr.findall(a) - if dates: - result = dates[0] - else: - result = '' - return result -def getTag(html): - try: - result = html.xpath('//strong[contains(text(),"類別")]/../span/a/text()') - return result - - except: - result = html.xpath('//strong[contains(text(),"類別")]/../span/text()') - return result - -def getCover_small(html, index=0): - # same issue mentioned below, - # javdb sometime returns multiple results - # DO NOT just get the firt one, get the one with correct index number - try: - result = html.xpath("//*[contains(@class,'movie-list')]/div/a/div[contains(@class, 'cover')]/img/@src")[index] - if not 'https' in result: - result = 'https:' + result - return result - except: # 2020.7.17 Repair Cover Url crawl - try: - result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@data-src")[index] - if not 'https' in result: - result = 'https:' + result - return result - except: - result = html.xpath("//div[@class='item-image']/img/@data-src")[index] - if not 'https' in result: - result = 'https:' + result - return result - - -def getTrailer(htmlcode): # 获取预告片 - video_pather = re.compile(r'<video id\=\".*?>\s*?<source src=\"(.*?)\"') - video = video_pather.findall(htmlcode) - # 加上数组判空 - if video and video[0] != "": - if not 'https:' in video[0]: - video_url = 'https:' + video[0] - else: - video_url = video[0] - else: - video_url = '' - return video_url - -def getExtrafanart(html): # 获取剧照 - result = [] - try: - result = html.xpath("//article[@class='message video-panel']/div[@class='message-body']/div[@class='tile-images preview-images']/a[contains(@href,'/samples/')]/@href") - except: - pass - return result -def getCover(html): - 
try: - result = html.xpath("//div[contains(@class, 'column-video-cover')]/a/img/@src")[0] - except: # 2020.7.17 Repair Cover Url crawl - result = html.xpath("//div[contains(@class, 'column-video-cover')]/img/@src")[0] - return result -def getDirector(html): - result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']") - result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']") - return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') -def getOutline(number, title, uncensored): #获取剧情介绍 多进程并发查询 - return getStoryline(number, title, 无码=uncensored) -def getSeries(html): - result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']") - result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']") - return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') -def getUserRating(html): - try: - result = str(html.xpath('//span[@class="score-stars"]/../text()')[0]) - v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', result) - return float(v[0][0]), int(v[0][1]) - except: - return -def getUncensored(html): - x = html.xpath('//strong[contains(text(),"類別")]/../span/a[contains(@href,"/tags/uncensored?")' - ' or contains(@href,"/tags/western?")]') - return bool(x) - -def main(number): - # javdb更新后同一时间只能登录一个数字站,最新登录站会踢出旧的登录,因此按找到的第一个javdb*.json文件选择站点, - # 如果无.json文件或者超过有效期,则随机选择一个站点。 - javdb_sites = config.getInstance().javdb_sites().split(',') - debug = config.getInstance().debug() - for i in javdb_sites: - javdb_sites[javdb_sites.index(i)] = "javdb" + i - javdb_sites.append("javdb") - try: - # if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number).group(): - # pass - # else: - # number = number.upper() - number = number.upper() - javdb_cookies = {'over18':'1', 'theme':'auto', 'locale':'zh'} - # 不加载过期的cookie,javdb登录界面显示为7天免登录,故假定cookie有效期为7天 - has_json = False - for cj in javdb_sites: - javdb_site = cj - cookie_json = javdb_site + '.json' - cookies_dict, cookies_filepath = load_cookies(cookie_json) - if isinstance(cookies_dict, dict) and isinstance(cookies_filepath, str): - cdays = file_modification_days(cookies_filepath) - if cdays < 7: - javdb_cookies = cookies_dict - has_json = True - break - elif cdays != 9999: - print(f'[!]Cookies file {cookies_filepath} was updated {cdays} days ago, it will not be used for HTTP requests.') - if not has_json: - javdb_site = secrets.choice(javdb_sites) - if debug: - print(f'[!]javdb:select site {javdb_site}') - session = None - javdb_url = 'https://' + javdb_site + '.com/search?q=' + number + '&f=all' - try: - if debug: - raise # try get_html_by_scraper() branch - res, session = get_html_session(javdb_url, cookies=javdb_cookies, return_type='session') - if not res: - raise - query_result = res.text - except: - res, session = get_html_by_scraper(javdb_url, cookies=javdb_cookies, return_type='scraper') - if not res: - raise ValueError('page not found') - query_result = res.text - if session is None: - raise ValueError('page not found') - html = etree.fromstring(query_result, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - # javdb sometime returns multiple results, - # and the first elememt maybe not the one we are looking for - # iterate all candidates and find the match one - urls = html.xpath('//*[contains(@class,"movie-list")]/div/a/@href') - # 记录一下欧美的ids ['Blacked','Blacked'] - if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number): - correct_url = urls[0] - else: - ids = 
html.xpath('//*[contains(@class,"movie-list")]/div/a/div[contains(@class, "video-title")]/strong/text()') - try: - correct_url = urls[ids.index(number)] - except: - # 为避免获得错误番号,只要精确对应的结果 - if ids[0].upper() != number: - raise ValueError("number not found") - correct_url = urls[0] - try: - # get faster benefit from http keep-alive - javdb_detail_url = urljoin(res.url, correct_url) - detail_page = session.get(javdb_detail_url).text - except: - detail_page = get_html('https://javdb.com' + correct_url, cookies=javdb_cookies) - session = None - - # etree.fromstring开销很大,最好只用一次,而它的xpath很快,比bs4 find/select快,可以多用 - lx = etree.fromstring(detail_page, etree.HTMLParser()) - imagecut = 1 - dp_number = getNum(lx) - if dp_number.upper() != number.upper(): - raise ValueError("number not eq"+dp_number) - title = getTitle(lx) - if title and dp_number: - number = dp_number - # remove duplicate title - title = title.replace(number, '').strip() - dic = { - 'actor': getActor(lx), - 'title': title, - 'studio': getStudio(detail_page, lx), - 'outline': getOutline(number, title, getUncensored(lx)), - 'runtime': getRuntime(lx), - 'director': getDirector(lx), - 'release': getRelease(detail_page), - 'number': number, - 'cover': getCover(lx), - 'trailer': getTrailer(detail_page), - 'extrafanart': getExtrafanart(lx), - 'imagecut': imagecut, - 'tag': getTag(lx), - 'label': getLabel(lx), - 'year': getYear(detail_page), # str(re.search('\d{4}',getRelease(a)).group()), - 'website': urljoin('https://javdb.com', correct_url), - 'source': 'javdb.py', - 'series': getSeries(lx), - '无码': getUncensored(lx) - } - userrating = getUserRating(lx) - if isinstance(userrating, tuple) and len(userrating) == 2: - dic['用户评分'] = userrating[0] - dic['评分人数'] = userrating[1] - if not dic['actor'] and re.match(r'FC2-[\d]+', number, re.A): - dic['actor'].append('素人') - if not dic['series']: - dic['series'] = dic['studio'] - if not dic['label']: - dic['label'] = dic['studio'] - if config.getInstance().download_actor_photo_for_kodi(): - dic['actor_photo'] = getActorPhoto(lx, javdb_site, session) - - - except Exception as e: - if debug: - print(e) - dic = {"title": ""} - js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') - return js - -# main('DV-1562') -# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。") -if __name__ == "__main__": - config.getInstance().set_override("storyline:switch=0") - config.getInstance().set_override("actor_photo:download_for_kodi=1") - config.getInstance().set_override("debug_mode:switch=1") - # print(main('blacked.20.05.30')) - print(main('AGAV-042')) - print(main('BANK-022')) - print(main('070116-197')) - print(main('093021_539')) # 没有剧照 片商pacopacomama - #print(main('FC2-2278260')) - # print(main('FC2-735670')) - # print(main('FC2-1174949')) # not found - print(main('MVSD-439')) - # print(main('EHM0001')) # not found - #print(main('FC2-2314275')) - print(main('EBOD-646')) - print(main('LOVE-262')) - print(main('ABP-890')) - print(main('blacked.14.12.08')) diff --git a/WebCrawler/javlib.py b/WebCrawler/javlib.py deleted file mode 100644 index 538fc19..0000000 --- a/WebCrawler/javlib.py +++ /dev/null @@ -1,161 +0,0 @@ -import sys -sys.path.append('../') -import json -import bs4 -import re -from WebCrawler import airav -from bs4 import BeautifulSoup -from lxml import html -from http.cookies import SimpleCookie - -from ADC_function import get_javlib_cookie, get_html - - -def main(number: str): - 
raw_cookies, user_agent = get_javlib_cookie() - - # Blank cookies mean javlib site return error - if not raw_cookies: - return json.dumps({}, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':')) - - # Manually construct a dictionary - s_cookie = SimpleCookie() - s_cookie.load(raw_cookies) - cookies = {} - for key, morsel in s_cookie.items(): - cookies[key] = morsel.value - - # Scraping - result = get_html( - "http://www.javlibrary.com/cn/vl_searchbyid.php?keyword={}".format(number), - cookies=cookies, - ua=user_agent, - return_type="object" - ) - soup = BeautifulSoup(result.text, "html.parser") - lx = html.fromstring(str(soup)) - - fanhao_pather = re.compile(r'<a href=".*?".*?><div class="id">(.*?)</div>') - fanhao = fanhao_pather.findall(result.text) - - if "/?v=jav" in result.url: - dic = { - "title": get_title(lx, soup), - "studio": get_table_el_single_anchor(soup, "video_maker"), - "year": get_table_el_td(soup, "video_date")[:4], - "outline": get_outline(number), - "director": get_table_el_single_anchor(soup, "video_director"), - "cover": get_cover(lx), - "imagecut": 1, - "actor_photo": "", - "website": result.url, - "source": "javlib.py", - "actor": get_table_el_multi_anchor(soup, "video_cast"), - "label": get_table_el_td(soup, "video_label"), - "tag": get_table_el_multi_anchor(soup, "video_genres"), - "number": get_table_el_td(soup, "video_id"), - "release": get_table_el_td(soup, "video_date"), - "runtime": get_from_xpath(lx, '//*[@id="video_length"]/table/tr/td[2]/span/text()'), - "series":'', - } - elif number.upper() in fanhao: - url_pather = re.compile(r'<a href="(.*?)".*?><div class="id">(.*?)</div>') - s = {} - url_list = url_pather.findall(result.text) - for url in url_list: - s[url[1]] = 'http://www.javlibrary.com/cn/' + url[0].lstrip('.') - av_url = s[number.upper()] - result = get_html( - av_url, - cookies=cookies, - ua=user_agent, - return_type="object" - ) - soup = BeautifulSoup(result.text, "html.parser") - lx = html.fromstring(str(soup)) - - dic = { - "title": get_title(lx, soup), - "studio": get_table_el_single_anchor(soup, "video_maker"), - "year": get_table_el_td(soup, "video_date")[:4], - "outline": get_outline(number), - "director": get_table_el_single_anchor(soup, "video_director"), - "cover": get_cover(lx), - "imagecut": 1, - "actor_photo": "", - "website": result.url, - "source": "javlib.py", - "actor": get_table_el_multi_anchor(soup, "video_cast"), - "label": get_table_el_td(soup, "video_label"), - "tag": get_table_el_multi_anchor(soup, "video_genres"), - "number": get_table_el_td(soup, "video_id"), - "release": get_table_el_td(soup, "video_date"), - "runtime": get_from_xpath(lx, '//*[@id="video_length"]/table/tr/td[2]/span/text()'), - "series": '', - } - else: - dic = {"title": ""} - - return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':')) - - -def get_from_xpath(lx: html.HtmlElement, xpath: str) -> str: - return lx.xpath(xpath)[0].strip() - - -def get_outline(number): - try: - response = json.loads(airav.main(number)) - result = response['outline'] - return result - except: - return '' - - -def get_table_el_single_anchor(soup: BeautifulSoup, tag_id: str) -> str: - tag = soup.find(id=tag_id).find("a") - - if tag is not None: - return tag.string.strip() - else: - return "" - - -def get_table_el_multi_anchor(soup: BeautifulSoup, tag_id: str) -> str: - tags = soup.find(id=tag_id).find_all("a") - - return process(tags) - - -def get_table_el_td(soup: BeautifulSoup, tag_id: str) -> str: - tags = 
soup.find(id=tag_id).find_all("td", class_="text") - - return process(tags) - - -def process(tags: bs4.element.ResultSet) -> str: - values = [] - for tag in tags: - value = tag.string - if value is not None and value != "----": - values.append(value) - - return ",".join(x for x in values if x) - - -def get_title(lx: html.HtmlElement, soup: BeautifulSoup) -> str: - title = get_from_xpath(lx, '//*[@id="video_title"]/h3/a/text()') - number = get_table_el_td(soup, "video_id") - - return title.replace(number, "").strip() - - -def get_cover(lx: html.HtmlComment) -> str: - return "http:{}".format(get_from_xpath(lx, '//*[@id="video_jacket_img"]/@src')) - - -if __name__ == "__main__": - lists = ["IPX-292", "STAR-438", "JKREZ-001", "KMHRS-010", "KNSD-023"] - #lists = ["DVMC-003"] - for num in lists: - print(main(num)) diff --git a/WebCrawler/madou.py b/WebCrawler/madou.py deleted file mode 100644 index 937fda3..0000000 --- a/WebCrawler/madou.py +++ /dev/null @@ -1,173 +0,0 @@ -import sys -sys.path.append('../') -from ADC_function import * -import json -import re -from lib2to3.pgen2 import parse - -from urllib.parse import urlparse, unquote - - -def getActorPhoto(html): - return '' - - -def getTitle(html): # 获取标题 - # <title>MD0140-2 / 家有性事EP2 爱在身边-麻豆社 - # MAD039 机灵可爱小叫花 强诱僧人迫犯色戒-麻豆社 - # MD0094/贫嘴贱舌中出大嫂/坏嫂嫂和小叔偷腥内射受孕-麻豆社 - # TM0002-我的痴女女友-麻豆社 - browser_title = str(html.xpath("/html/head/title/text()")[0]) - title = str(re.findall(r'^[A-Z0-9 //\-]*(.*)-麻豆社$', browser_title)[0]).strip() - return title - -def getStudio(html): # 获取厂商 已修改 - try: - category = str(html.xpath('//a[@rel="category tag"]/text()')[0]) - return category.strip() - except: - return '麻豆社' - - -def getYear(html): # 获取年份 - return '' - - -def getCover(htmlcode): # 获取封面图片 - try: - url = str(re.findall("shareimage : '(.*?)'", htmlcode)[0]) - return url.strip() - except: - return '' - - -def getRelease(html): # 获取出版日期 - return '' - - -def getRuntime(html): # 获取播放时长 - return '' - -def getUrl(html): - return str(html.xpath('//a[@class="share-weixin"]/@data-url')[0]) - - -def getNum(url, number): # 获取番号 - try: - # 解码url - filename = unquote(urlparse(url).path) - # 裁剪文件名 - result = filename[1:-5].upper().strip() - # 移除中文 - if result.upper() != number.upper(): - result = re.split(r'[^\x00-\x7F]+', result, 1)[0] - # 移除多余的符号 - return result.strip('-') - except: - return '' - - -def getDirector(html): # 获取导演 已修改 - return '' - - -def getOutline(html): # 获取概述 - return '' - - -def getSerise(html): # 获取系列 已修改 - return '' - - -def getTag(html, studio): # 获取标签 - x = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',') - return [i.strip() for i in x if len(i.strip()) and studio not in i and '麻豆' not in i] - - -def getExtrafanart(html): # 获取剧照 - return '' - - -def cutTags(tags): - actors = [] - tags = [] - for tag in tags: - actors.append(tag) - return actors,tags - - -def main(number): - try: - try: - number = number.lower().strip() - url = "https://madou.club/" + number + ".html" - htmlcode = get_html(url) - except: - # print(number) - pass - - html = etree.fromstring(htmlcode, etree.HTMLParser()) - url = getUrl(html) - studio = getStudio(html) - tags = getTag(html, studio) - #actor,tags = cutTags(tags) # 演员在tags中的位置不固定,放弃尝试获取 - actor = '' - dic = { - # 标题 - 'title': getTitle(html), - # 制作商 - 'studio': studio, - # 年份 - 'year': getYear(html), - # 简介 - 'outline': getOutline(html), - # - 'runtime': getRuntime(html), - # 导演 - 'director': getDirector(html), - # 演员 - 'actor': actor, - # 发售日 - 'release': getRelease(html), - # 番号 - 'number': 
getNum(url, number), - # 封面链接 - 'cover': getCover(htmlcode), - # 剧照获取 - 'extrafanart': getExtrafanart(html), - 'imagecut': 1, - # - 'tag': tags, - # - 'label': getSerise(html), - # 作者图片 - 'website': url, - 'source': 'madou.py', - # 使用 - 'series': getSerise(html), - '无码': True - } - js = json.dumps(dic, ensure_ascii=False, sort_keys=True, - indent=4, separators=(',', ':'), ) # .encode('UTF-8') - return js - except Exception as e: - if config.getInstance().debug(): - print(e) - data = { - "title": "", - } - js = json.dumps( - data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":") - ) - return js - - -if __name__ == '__main__': - config.getInstance().set_override("debug_mode:switch=1") - print(main('MD0129')) - # print(main('TM0002')) - # print(main('MD0222')) - # print(main('MD0140-2')) - # print(main('MAD039')) - # print(main('JDMY027')) - diff --git a/WebCrawler/mgstage.py b/WebCrawler/mgstage.py deleted file mode 100644 index bb344f9..0000000 --- a/WebCrawler/mgstage.py +++ /dev/null @@ -1,68 +0,0 @@ -import sys -sys.path.append('../') -from bs4 import BeautifulSoup -from ADC_function import * -from WebCrawler.crawler import * - -class MgsCrawler(Crawler): - def getMgsString(self, _xpath): - html = self.html - result1 = str(html.xpath(_xpath)).strip(" ['']").strip('\\n ').strip('\\n').strip(" ['']").replace(u'\\n', '').replace("', '', '", '') - result2 = str(html.xpath(_xpath.replace('td/a/','td/'))).strip(" ['']").strip('\\n ').strip('\\n') - return str(result1 + result2).strip('+').replace("', '",'').replace('"','') - -def getTag(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n') - result2 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n') - result = str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','').replace(',,','').split(',') - return result - -def getExtrafanart(htmlcode2): # 获取剧照 - html_pather = re.compile(r'
\s*?\s*?
') - html = html_pather.search(htmlcode2) - if html: - html = html.group() - extrafanart_pather = re.compile(r' div > div.frame > div.content > div > ul.profileCL > li.credit-links > a') - t = [] - for i in htmla: - t.append(i.text.strip()) - return t - - -def getActorPhoto(browser): - htmla = browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a') - t = {i.text.strip(): i['href'] for i in htmla} - o = {} - for k, v in t.items(): - r = browser.open_relative(v) - if not r.ok: - continue - pic = browser.page.select_one('#avidolDetails > div > div.frame > div > p > img') - if 'noimage.gif' in pic['src']: - continue - o[k] = urljoin(browser.url, pic['src']) - return o - - -def getStudio(html): - try: - result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[4]/a/span/text()')).strip(" ['']") - except: - result = str(html.xpath('//strong[contains(text(),"片商")]/../following-sibling::span/a/text()')).strip(" ['']") - return result.strip('+').replace("', '", '').replace('"', '') - - -def getRuntime(html): - try: - x = html.xpath('//span[@class="koumoku" and text()="収録時間"]/../text()')[1].strip() - return x - except: - return '' - -def getLabel(html): - try: - result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[5]/a/span/text()')[0] - return result - except: - return '' - - -def getNum(html): - try: - result = html.xpath('//*[@id="hinban"]/text()')[0] - return result - except: - return '' - - -def getYear(getRelease): - try: - result = str(re.search('\d{4}', getRelease).group()) - return result - except: - return getRelease - - -def getRelease(html): - try: - result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[2]/text()')[1]) - except: - return '' - try: - return re.findall('\d{4}/\d{2}/\d{2}', result)[0].replace('/','-') - except: - return '' - - -def getTag(html): - result = html.xpath('//span[@class="koumoku" and text()="ジャンル"]/../a[starts-with(@href,"/avod/genre/")]/text()') - total = [] - for i in result: - total.append(i.replace("\n","").replace("\t","")) - return total - - -def getCover_small(html, index=0): - # same issue mentioned below, - # javdb sometime returns multiple results - # DO NOT just get the firt one, get the one with correct index number - result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index] - if not 'https' in result: - result = 'https:' + result - return result - - -def getCover(html): - try: - result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[1]/p/a/@href')[0] - return 'https:' + result - except: - return '' - - -def getDirector(html): - try: - result = html.xpath('//*[@id="program_detail_director"]/text()')[0].replace(u'\n','').replace(u'\t', '') - return result - except: - return '' - - -def getOutline(html, number, title): - storyline_site = config.getInstance().storyline_site().split(',') - a = set(storyline_site) & {'airav', 'avno1'} # 只要中文的简介文字 - if len(a): - site = [n for n in storyline_site if n in a] - g = getStoryline(number, title, site, 无码=False) - if len(g): - return g - try: - x = html.xpath('//h2[@class="title-detail"]/../p[@class="lead"]/text()')[0] - return x.replace(getNum(html), '') - except: - return '' - -def getSeries(html): - try: - try: - result = html.xpath("//span[contains(text(),'シリーズ')]/../a/span/text()")[0] - return result - except: - result = html.xpath("//span[contains(text(),'シリーズ')]/../span/text()")[0] - return result - except: - return '' - -def getExtrafanart(htmlcode): # 获取剧照 
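# Two-stage scrape, same shape as the javbus getExtrafanart earlier in this patch:
# the first regex isolates the photo-gallery block, the second collects the image
# links inside it.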
- html_pather = re.compile(r'
[\s\S]*?
') - html = html_pather.search(htmlcode) - if html: - html = html.group() - extrafanart_pather = re.compile(r' bool: # 元数据获取失败检测 - if "title" not in data or "number" not in data: - return False - - if data["title"] is None or data["title"] == "" or data["title"] == "null": - return False - - if data["number"] is None or data["number"] == "" or data["number"] == "null": - return False - - return True - +from ADC_function import delete_all_elements_in_list, delete_all_elements_in_str, file_modification_days, load_cookies, translate +from scrapinglib.api import search def get_data_from_json(file_number, oCC): """ @@ -49,116 +15,45 @@ def get_data_from_json(file_number, oCC): actor_mapping_data = etree.parse(str(Path.home() / '.local' / 'share' / 'mdc' / 'mapping_actor.xml')) info_mapping_data = etree.parse(str(Path.home() / '.local' / 'share' / 'mdc' / 'mapping_info.xml')) - func_mapping = { - "airav": airav.main, - "avsox": avsox.main, - "fc2": fc2.main, - "fanza": fanza.main, - "javdb": javdb.main, - "javbus": javbus.main, - "mgstage": mgstage.main, - "jav321": jav321.main, - "xcity": xcity.main, - # "javlib": javlib.main, - "dlsite": dlsite.main, - "carib": carib.main, - "fc2club": fc2club.main, - "mv91": mv91.main, - "madou": madou.main, - "gcolle": gcolle.main, - "getchu": getchu.main, - } - conf = config.getInstance() # default fetch order list, from the beginning to the end - sources = conf.sources().split(',') - def insert(sources,source): - if source in sources: - sources.insert(0, sources.pop(sources.index(source))) - return sources + sources = conf.sources() - if len(sources) <= len(func_mapping): - # if the input file name matches certain rules, - # move some web service to the beginning of the list - lo_file_number = file_number.lower() - if "carib" in sources and (re.search(r"^\d{6}-\d{3}", file_number) - ): - sources = insert(sources,"carib") - elif "item" in file_number or "GETCHU" in file_number.upper(): - sources = insert(sources,"getchu") - elif "rj" in lo_file_number or "vj" in lo_file_number or re.search(r"[\u3040-\u309F\u30A0-\u30FF]+", file_number): - sources = insert(sources, "getchu") - sources = insert(sources, "dlsite") - elif re.search(r"^\d{5,}", file_number) or "heyzo" in lo_file_number: - if "avsox" in sources: - sources = insert(sources,"avsox") - elif "mgstage" in sources and \ - (re.search(r"\d+\D+", file_number) or "siro" in lo_file_number): - sources = insert(sources,"mgstage") - elif "fc2" in lo_file_number: - if "fc2" in sources: - sources = insert(sources,"fc2") - elif "gcolle" in sources and (re.search("\d{6}", file_number)): - sources = insert(sources,"gcolle") - elif re.search(r"^[a-z0-9]{3,}$", lo_file_number): - if "xcity" in sources: - sources = insert(sources,"xcity") - if "madou" in sources: - sources = insert(sources,"madou") - elif "madou" in sources and ( - re.search(r"^[a-z0-9]{3,}-[0-9]{1,}$", lo_file_number) - ): - sources = insert(sources,"madou") - - # check sources in func_mapping - todel = [] - for s in sources: - if not s in func_mapping: - print('[!] Source Not Exist : ' + s) - todel.append(s) - for d in todel: - print('[!] 
Remove Source : ' + s) - sources.remove(d) - - json_data = {} - - if conf.multi_threading(): - pool = ThreadPool(processes=len(conf.sources().split(','))) - - # Set the priority of multi-thread crawling and join the multi-thread queue - for source in sources: - pool.apply_async(func_mapping[source], (file_number,)) - - # Get multi-threaded crawling response - for source in sources: - if conf.debug() == True: - print('[+]select', source) - try: - json_data = json.loads(pool.apply_async(func_mapping[source], (file_number,)).get()) - except: - json_data = pool.apply_async(func_mapping[source], (file_number,)).get() - # if any service return a valid return, break - if get_data_state(json_data): - print(f"[+]Find movie [{file_number}] metadata on website '{source}'") + # TODO 准备参数 + # - 清理 ADC_function, webcrawler + proxies = None + configProxy = conf.proxy() + if configProxy.enable: + proxies = configProxy.proxies() + + javdb_sites = conf.javdb_sites().split(',') + for i in javdb_sites: + javdb_sites[javdb_sites.index(i)] = "javdb" + i + javdb_sites.append("javdb") + # 不加载过期的cookie,javdb登录界面显示为7天免登录,故假定cookie有效期为7天 + has_json = False + for cj in javdb_sites: + javdb_site = cj + cookie_json = javdb_site + '.json' + cookies_dict, cookies_filepath = load_cookies(cookie_json) + if isinstance(cookies_dict, dict) and isinstance(cookies_filepath, str): + cdays = file_modification_days(cookies_filepath) + if cdays < 7: + javdb_cookies = cookies_dict + has_json = True break - pool.close() - pool.terminate() - else: - for source in sources: - try: - if conf.debug() == True: - print('[+]select', source) - try: - json_data = json.loads(func_mapping[source](file_number)) - except: - json_data = func_mapping[source](file_number) - # if any service return a valid return, break - if get_data_state(json_data): - print(f"[+]Find movie [{file_number}] metadata on website '{source}'") - break - except: - continue + elif cdays != 9999: + print(f'[!]Cookies file {cookies_filepath} was updated {cdays} days ago, it will not be used for HTTP requests.') + if not has_json: + javdb_site = secrets.choice(javdb_sites) + javdb_cookies = None + cacert =None + if conf.cacert_file(): + cacert = conf.cacert_file() + json_data = search(file_number, sources, proxies=proxies, verify=cacert, + dbsite=javdb_site, dbcookies=javdb_cookies, + morestoryline=conf.is_storyline()) # Return if data not found in all sources if not json_data: print('[-]Movie Number not found!') @@ -316,26 +211,26 @@ def get_data_from_json(file_number, oCC): try: if ccm == 1: json_data[cc] = convert_list(info_mapping_data, "zh_cn", json_data[cc]) - json_data[cc] = ADC_function.delete_all_elements_in_list("删除", json_data[cc]) + json_data[cc] = delete_all_elements_in_list("删除", json_data[cc]) elif ccm == 2: json_data[cc] = convert_list(info_mapping_data, "zh_tw", json_data[cc]) - json_data[cc] = ADC_function.delete_all_elements_in_list("删除", json_data[cc]) + json_data[cc] = delete_all_elements_in_list("删除", json_data[cc]) elif ccm == 3: json_data[cc] = convert_list(info_mapping_data, "jp", json_data[cc]) - json_data[cc] = ADC_function.delete_all_elements_in_list("删除", json_data[cc]) + json_data[cc] = delete_all_elements_in_list("删除", json_data[cc]) except: json_data[cc] = [oCC.convert(t) for t in json_data[cc]] else: try: if ccm == 1: json_data[cc] = convert(info_mapping_data, "zh_cn", json_data[cc]) - json_data[cc] = ADC_function.delete_all_elements_in_str("删除", json_data[cc]) + json_data[cc] = delete_all_elements_in_str("删除", json_data[cc]) elif ccm == 2: 
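# ccm picks the target column of mapping_info.xml: 1 = zh_cn, 2 = zh_tw, 3 = jp;
# when the mapping lookup throws, the code falls back to OpenCC (oCC.convert).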
json_data[cc] = convert(info_mapping_data, "zh_tw", json_data[cc]) - json_data[cc] = ADC_function.delete_all_elements_in_str("删除", json_data[cc]) + json_data[cc] = delete_all_elements_in_str("删除", json_data[cc]) elif ccm == 3: json_data[cc] = convert(info_mapping_data, "jp", json_data[cc]) - json_data[cc] = ADC_function.delete_all_elements_in_str("删除", json_data[cc]) + json_data[cc] = delete_all_elements_in_str("删除", json_data[cc]) except IndexError: json_data[cc] = oCC.convert(json_data[cc]) except: diff --git a/scrapinglib/__init__.py b/scrapinglib/__init__.py new file mode 100644 index 0000000..9f2e761 --- /dev/null +++ b/scrapinglib/__init__.py @@ -0,0 +1,3 @@ +# -*- coding: utf-8 -*- + +from .api import search diff --git a/scrapinglib/airav.py b/scrapinglib/airav.py new file mode 100644 index 0000000..1d36805 --- /dev/null +++ b/scrapinglib/airav.py @@ -0,0 +1,114 @@ +# -*- coding: utf-8 -*- + +import json +import re +from lxml import etree +from .parser import Parser +from .javbus import Javbus + +class Airav(Parser): + source = 'airav' + + expr_title = '/html/head/title/text()' + expr_number = '/html/head/title/text()' + expr_studio = '//a[contains(@href,"?video_factory=")]/text()' + expr_release = '//li[contains(text(),"發片日期")]/text()' + expr_outline = "string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)" + expr_actor = '//ul[@class="videoAvstarList"]/li/a[starts-with(@href,"/idol/")]/text()' + expr_cover = '//img[contains(@src,"/storage/big_pic/")]/@src' + expr_tags = '//div[@class="tagBtnMargin"]/a/text()' + expr_extrafanart = '//div[@class="mobileImgThumbnail"]/a/@href' + + def search(self, number): + self.number = number + self.detailurl = 'https://cn.airav.wiki/video/' + number + engine = Javbus() + javbusinfo = engine.scrape(number, self) + if javbusinfo == 404: + self.javbus = {"title": ""} + else: + self.javbus = json.loads(javbusinfo) + self.htmlcode = self.getHtml(self.detailurl) + htmltree = etree.fromstring(self.htmlcode, etree.HTMLParser()) + result = self.dictformat(htmltree) + return result + + def getNum(self, htmltree): + # return super().getNum(htmltree) + result = self.javbus.get('number') + if isinstance(result, str) and len(result): + return result + number = super().getNum(htmltree) + result = str(re.findall('^\[(.*?)]', number)[0]) + return result + + def getTitle(self, htmltree): + title = super().getTitle(htmltree) + result = str(re.findall('](.*?)- AIRAV-WIKI', title)[0]).strip() + return result + + def getStudio(self, htmltree): + result = self.javbus.get('studio') + if isinstance(result, str) and len(result): + return result + return super().getStudio(htmltree) + + def getRelease(self, htmltree): + result = self.javbus.get('release') + if isinstance(result, str) and len(result): + return result + try: + return re.search(r'\d{4}-\d{2}-\d{2}', str(super().getRelease(htmltree))).group() + except: + return '' + + def getYear(self, htmltree): + result = self.javbus.get('year') + if isinstance(result, str) and len(result): + return result + release = self.getRelease(htmltree) + return str(re.findall('\d{4}', release)).strip(" ['']") + + def getOutline(self, htmltree): + return self.getTreeAll(htmltree, self.expr_outline).replace('\n','').strip() + + def getRuntime(self, htmltree): + result = self.javbus.get('runtime') + if isinstance(result, str) and len(result): + return result + return '' + + def getDirector(self, htmltree): + result = self.javbus.get('director') + if isinstance(result, str) and len(result): + return result + return '' + + 
def getActors(self, htmltree): + b=[] + a = super().getActors(htmltree) + for v in a: + v = v.strip() + if len(v): + b.append(v) + if len(b): + return b + result = self.javbus.get('actor') + if isinstance(result, list) and len(result): + return result + return [] + + def getCover(self, htmltree): + result = self.javbus.get('cover') + if isinstance(result, str) and len(result): + return result + return super().getCover(htmltree) + + def getTags(self, htmltree): + return self.getTreeAll(htmltree, self.expr_tags) + + def getSeries(self, htmltree): + result = self.javbus.get('series') + if isinstance(result, str) and len(result): + return result + return '' diff --git a/scrapinglib/api.py b/scrapinglib/api.py new file mode 100644 index 0000000..c8c4679 --- /dev/null +++ b/scrapinglib/api.py @@ -0,0 +1,225 @@ +# -*- coding: utf-8 -*- + +import re +import json + +from .airav import Airav +from .carib import Carib +from .dlsite import Dlsite +from .fanza import Fanza +from .gcolle import Gcolle +from .getchu import Getchu +from .jav321 import Jav321 +from .javdb import Javdb +from .mv91 import Mv91 +from .fc2 import Fc2 +from .madou import Madou +from .mgstage import Mgstage +from .javbus import Javbus +from .xcity import Xcity +from .avsox import Avsox + +from .tmdb import Tmdb + + +def search(number, sources: str=None, proxies=None, verify=None, type='adult', + dbcookies=None, dbsite=None, morestoryline=False): + """ 根据``番号/电影``名搜索信息 + + :param number: number/name depends on type + :param sources: sources string with `,` like ``avsox,javbus`` + :param type: ``adult``, ``general`` + """ + sc = Scraping() + return sc.search(number, sources, proxies=proxies, verify=verify, type=type, + dbcookies=dbcookies, dbsite=dbsite, morestoryline=morestoryline) + +class Scraping(): + """ + """ + + adult_full_sources = ['avsox', 'javbus', 'xcity', 'mgstage', 'madou', 'fc2', + 'dlsite', 'jav321', 'fanza', 'airav', 'carib', 'mv91', + 'gcolle', 'javdb', 'getchu'] + adult_func_mapping = { + 'avsox': Avsox().scrape, + 'javbus': Javbus().scrape, + 'xcity': Xcity().scrape, + 'mgstage': Mgstage().scrape, + 'madou': Madou().scrape, + 'fc2': Fc2().scrape, + 'dlsite': Dlsite().scrape, + 'jav321': Jav321().scrape, + 'fanza': Fanza().scrape, + 'airav': Airav().scrape, + 'carib': Carib().scrape, + 'mv91': Mv91().scrape, + 'gcolle': Gcolle().scrape, + 'javdb': Javdb().scrape, + 'getchu': Getchu().scrape, + } + + general_full_sources = ['tmdb'] + general_func_mapping = { + 'tmdb': Tmdb().scrape, + } + + proxies = None + verify = None + + dbcookies = None + dbsite = None + # 使用storyline方法进一步获取故事情节 + morestoryline = False + + def search(self, number, sources=None, proxies=None, verify=None, type='adult', + dbcookies=None, dbsite=None, morestoryline=False): + self.proxies = proxies + self.verify = verify + self.dbcookies = dbcookies + self.dbsite = dbsite + self.morestoryline = morestoryline + if type == 'adult': + return self.searchAdult(number, sources) + else: + return self.searchGeneral(number, sources) + + def searchGeneral(self, name, sources): + """ 查询电影电视剧 + imdb,tmdb + """ + sources = self.checkGeneralSources(sources, name) + json_data = {} + for source in sources: + try: + print('[+]select', source) + try: + data = self.general_func_mapping[source](name, self) + if data == 404: + continue + json_data = json.loads(data) + except Exception as e: + print('[!] 
Error occurred') + print(e) + # if any service return a valid return, break + if self.get_data_state(json_data): + print(f"[+]Find movie [{name}] metadata on website '{source}'") + break + except: + continue + + # Return if data not found in all sources + if not json_data: + print(f'[-]Movie Number [{name}] not found!') + return None + + return json_data +
+ def searchAdult(self, number, sources): + sources = self.checkAdultSources(sources, number) + json_data = {} + for source in sources: + try: + print('[+]select', source) + try: + data = self.adult_func_mapping[source](number, self) + if data == 404: + continue + json_data = json.loads(data) + except Exception as e: + print('[!] Error occurred') + print(e) + # json_data = self.func_mapping[source](number, self) + # if any service return a valid return, break + if self.get_data_state(json_data): + print(f"[+]Find movie [{number}] metadata on website '{source}'") + break + except: + continue + + # Return if data not found in all sources + if not json_data: + print(f'[-]Movie Number [{number}] not found!') + return None + + return json_data +
+ def checkGeneralSources(self, c_sources, name): + if not c_sources: + sources = self.general_full_sources + else: + sources = c_sources.split(',') + + # check sources in func_mapping + todel = [] + for s in sources: + if s not in self.general_func_mapping: + print('[!] Source Not Exist : ' + s) + todel.append(s) + for d in todel: + print('[!] Remove Source : ' + d) + sources.remove(d) + return sources +
+ def checkAdultSources(self, c_sources, file_number): + if not c_sources: + sources = self.adult_full_sources + else: + sources = c_sources.split(',') + def insert(sources,source): + if source in sources: + sources.insert(0, sources.pop(sources.index(source))) + return sources + + if len(sources) <= len(self.adult_func_mapping): + # if the input file name matches certain rules, + # move some web service to the beginning of the list + lo_file_number = file_number.lower() + if "carib" in sources and (re.search(r"^\d{6}-\d{3}", file_number) + ): + sources = insert(sources,"carib") + elif "item" in file_number or "GETCHU" in file_number.upper(): + sources = insert(sources,"getchu") + elif "rj" in lo_file_number or "vj" in lo_file_number or re.search(r"[\u3040-\u309F\u30A0-\u30FF]+", file_number): + sources = insert(sources, "getchu") + sources = insert(sources, "dlsite") + elif re.search(r"^\d{5,}", file_number) or "heyzo" in lo_file_number: + if "avsox" in sources: + sources = insert(sources,"avsox") + elif "mgstage" in sources and \ + (re.search(r"\d+\D+", file_number) or "siro" in lo_file_number): + sources = insert(sources,"mgstage") + elif "fc2" in lo_file_number: + if "fc2" in sources: + sources = insert(sources,"fc2") + elif "gcolle" in sources and (re.search("\d{6}", file_number)): + sources = insert(sources,"gcolle") + elif re.search(r"^[a-z0-9]{3,}$", lo_file_number): + if "xcity" in sources: + sources = insert(sources,"xcity") + if "madou" in sources: + sources = insert(sources,"madou") + elif "madou" in sources and ( + re.search(r"^[a-z0-9]{3,}-[0-9]{1,}$", lo_file_number) + ): + sources = insert(sources,"madou") + + # check sources in func_mapping + todel = [] + for s in sources: + if s not in self.adult_func_mapping: + print('[!] Source Not Exist : ' + s) + todel.append(s) + for d in todel: + print('[!] 
Remove Source : ' + d) + sources.remove(d) + return sources +
+ def get_data_state(self, data: dict) -> bool: # detects a failed metadata fetch + if "title" not in data or "number" not in data: + return False + if data["title"] is None or data["title"] == "" or data["title"] == "null": + return False + if data["number"] is None or data["number"] == "" or data["number"] == "null": + return False + return True
diff --git a/scrapinglib/avsox.py b/scrapinglib/avsox.py new file mode 100644 index 0000000..3fde11e --- /dev/null +++ b/scrapinglib/avsox.py @@ -0,0 +1,80 @@ +# -*- coding: utf-8 -*- + +import re +from .parser import Parser + + +class Avsox(Parser): + + source = 'avsox' + imagecut = 3 +
+ expr_number = '//span[contains(text(),"识别码:")]/../span[2]/text()' + expr_actor = '//a[@class="avatar-box"]' + expr_actorphoto = '//a[@class="avatar-box"]' + expr_title = '/html/body/div[2]/h3/text()' + expr_studio = '//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()' + expr_release = '//span[contains(text(),"发行时间:")]/../text()' + expr_cover = '/html/body/div[2]/div[1]/div[1]/a/img/@src' + expr_smallcover = '//*[@id="waterfall"]/div/a/div[1]/img/@src' + expr_tags = '/html/head/meta[@name="keywords"]/@content' + expr_label = '//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()' + expr_series = '//span[contains(text(),"系列:")]/../span[2]/text()' +
+ def queryNumberUrl(self, number): + qurySiteTree = self.getHtmlTree('https://tellme.pw/avsox') + site = self.getTreeElement(qurySiteTree, '//div[@class="container"]/div/a/@href') + self.searchtree = self.getHtmlTree(site + '/cn/search/' + number) + result1 = self.getTreeElement(self.searchtree, '//*[@id="waterfall"]/div/a/@href') + if result1 == '' or result1 == 'null' or result1 == 'None': + self.searchtree = self.getHtmlTree(site + '/cn/search/' + number.replace('-', '_')) + result1 = self.getTreeElement(self.searchtree, '//*[@id="waterfall"]/div/a/@href') + if result1 == '' or result1 == 'null' or result1 == 'None': + self.searchtree = self.getHtmlTree(site + '/cn/search/' + number.replace('_', '')) + result1 = self.getTreeElement(self.searchtree, '//*[@id="waterfall"]/div/a/@href') + return "https:" + result1 +
+ def getNum(self, htmltree): + new_number = self.getTreeElement(htmltree, self.expr_number) + if new_number.upper() != self.number.upper(): + raise ValueError('number not found in ' + self.source) + self.number = new_number + return new_number +
+ def getTitle(self, htmltree): + return super().getTitle(htmltree).replace('/', '').strip(self.number) +
+ def getStudio(self, htmltree): + return super().getStudio(htmltree).replace("', '", ' ') +
+ def getSmallCover(self, htmltree): + """ use the small preview image from the search results page + """ + return self.getTreeElement(self.searchtree, self.expr_smallcover) +
+ def getTags(self, htmltree): + tags = super().getTags(htmltree).split(',') + return [i.strip() for i in tags[2:]] if len(tags) > 2 else [] +
+ def getOutline(self, htmltree): + if self.morestoryline: + from .storyline import getStoryline + return getStoryline(self.number) + return '' +
+ def getActors(self, htmltree): + a = super().getActors(htmltree) + d = [] + for i in a: + d.append(i.find('span').text) + return d +
+ def getActorPhoto(self, htmltree): + a = super().getActorPhoto(htmltree) + d = {} + for i in a: + l = i.find('.//img').attrib['src'] + t = i.find('span').text + p2 = {t: l} + d.update(p2) + return d
diff --git a/scrapinglib/carib.py b/scrapinglib/carib.py new file mode 100644 index 0000000..9fac553 --- /dev/null +++ b/scrapinglib/carib.py @@ -0,0 +1,99 @@ +# -*- 
coding: utf-8 -*- + +import re +from urllib.parse import urljoin +from lxml import html +from .parser import Parser + + +class Carib(Parser): + source = 'carib' + uncensored = True + + expr_title = "//div[@class='movie-info section']/div[@class='heading']/h1[@itemprop='name']/text()" + expr_release = "//li[2]/span[@class='spec-content']/text()" + expr_runtime = "//span[@class='spec-content']/span[@itemprop='duration']/text()" + expr_actor = "//span[@class='spec-content']/a[@itemprop='actor']/span/text()" + expr_tags = "//span[@class='spec-content']/a[@itemprop='genre']/text()" + expr_extrafanart = "//*[@id='sampleexclude']/div[2]/div/div[@class='grid-item']/div/a/@href" + expr_label = "//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()" + expr_series = "//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()" + expr_outline = "//div[@class='movie-info section']/p[@itemprop='description']/text()" + + def search(self, number): + self.number = number + self.detailurl = f'https://www.caribbeancom.com/moviepages/{number}/index.html' + htmlcode = self.getHtml(self.detailurl) + if htmlcode == 404 or 'class="movie-info section"' not in htmlcode: + return 404 + htmltree = html.fromstring(htmlcode) + result = self.dictformat(htmltree) + return result + + def getStudio(self, htmltree): + return '加勒比' + + def getActors(self, htmltree): + r = [] + actors = super().getActors(htmltree) + for act in actors: + if str(act) != '他': + r.append(act) + return r + + def getNum(self, htmltree): + return self.number + + def getCover(self, htmltree): + return f'https://www.caribbeancom.com/moviepages/{self.number}/images/l_l.jpg' + + def getTags(self, htmltree): + return self.getTreeAll(htmltree, self.expr_tags) + + def getExtrafanart(self, htmltree): + r = [] + genres = self.getTreeAll(htmltree, self.expr_extrafanart) + for g in genres: + jpg = str(g) + if '/member/' in jpg: + break + else: + r.append('https://www.caribbeancom.com' + jpg) + return r + + def getActorPhoto(self, htmltree): + # return super().getActorPhoto(htmltree) + htmla = htmltree.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']") + names = htmltree.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']/span[@itemprop='name']/text()") + t = {} + for name, a in zip(names, htmla): + if name.strip() == '他': + continue + p = {name.strip(): a.attrib['href']} + t.update(p) + o = {} + for k, v in t.items(): + if '/search_act/' not in v: + continue + r = self.getHtml(urljoin('https://www.caribbeancom.com', v), type='object') + if not r.ok: + continue + html = r.text + pos = html.find('.full-bg') + if pos<0: + continue + css = html[pos:pos+100] + cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I) + if not cssBGjpgs or not len(cssBGjpgs[0]): + continue + p = {k: urljoin(r.url, cssBGjpgs[0])} + o.update(p) + return o + + def getOutline(self, htmltree): + from .storyline import getStoryline + result = getStoryline(self.number, uncensored=self.uncensored) + if len(result): + return result + return super().getOutline(htmltree) + diff --git a/scrapinglib/dlsite.py b/scrapinglib/dlsite.py new file mode 100644 index 0000000..25f1203 --- /dev/null +++ b/scrapinglib/dlsite.py @@ -0,0 +1,97 @@ +# -*- 
coding: utf-8 -*- + +import re +from lxml import etree +from .parser import Parser + + +class Dlsite(Parser): + source = 'dlsite' + imagecut = 4 + allow_number_change = True + + expr_title = '/html/head/title/text()' + expr_actor = '//th[contains(text(),"声优")]/../td/a/text()' + expr_studio = '//th[contains(text(),"商标名")]/../td/span[1]/a/text()' + expr_studio2 = '//th[contains(text(),"社团名")]/../td/span[1]/a/text()' + expr_runtime = '//strong[contains(text(),"時長")]/../span/text()' + expr_runtime2 = '//strong[contains(text(),"時長")]/../span/a/text()' + expr_outline = '//*[@class="work_parts_area"]/p/text()' + expr_series = '//th[contains(text(),"系列名")]/../td/span[1]/a/text()' + expr_series2 = '//th[contains(text(),"社团名")]/../td/span[1]/a/text()' + expr_director = '//th[contains(text(),"剧情")]/../td/a/text()' + expr_release = '//th[contains(text(),"贩卖日")]/../td/a/text()' + expr_cover = '//*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li[1]/picture/source/@srcset' + expr_tags = '//th[contains(text(),"分类")]/../td/div/a/text()' + expr_label = '//th[contains(text(),"系列名")]/../td/span[1]/a/text()' + expr_label2 = '//th[contains(text(),"社团名")]/../td/span[1]/a/text()' + expr_extrafanart = '//*[@id="work_left"]/div/div/div[1]/div/@data-src' + + def search(self, number): + self.cookies = {'locale': 'zh-cn'} + if "RJ" in number or "VJ" in number: + self.number = number.upper() + self.detailurl = 'https://www.dlsite.com/maniax/work/=/product_id/' + self.number + '.html/?locale=zh_CN' + htmltree = self.getHtmlTree(self.detailurl) + else: + self.detailurl = f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie' + htmltree = self.getHtmlTree(self.detailurl) + search_result = self.getTreeAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href') + if len(search_result) == 0: + number = number.replace("THE ANIMATION", "").replace("he Animation", "").replace("t", "").replace("T","") + htmltree = self.getHtmlTree(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie') + search_result = self.getTreeAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href') + if len(search_result) == 0: + if "~" in number: + number = number.replace("~","〜") + elif "〜" in number: + number = number.replace("〜","~") + htmltree = self.getHtmlTree(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie') + search_result = self.getTreeAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href') + if len(search_result) == 0: + number = number.replace('上巻', '').replace('下巻', '').replace('前編', '').replace('後編', '') + htmltree = self.getHtmlTree(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie') + search_result = self.getTreeAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href') + self.detailurl = search_result[0] + htmltree = self.getHtmlTree(self.detailurl) + self.number = str(re.findall("\wJ\w+", self.detailurl)).strip(" [']") + + result = self.dictformat(htmltree) + return result + + def getNum(self, htmltree): + return self.number + + def getTitle(self, htmltree): + result = super().getTitle(htmltree) + result = result[:result.rfind(' | DLsite')] + result = result[:result.rfind(' [')] + if 'OFF】' in result: + result = result[result.find('】')+1:] + result = 
result.replace('【HD版】', '') + return result + + def getOutline(self, htmltree): + total = [] + result = self.getTreeAll(htmltree, self.expr_outline) + for i in result: + total.append(i.strip('\r\n')) + return str(total).strip(" ['']").replace("', '', '",r'\n').replace("', '",r'\n').strip(", '', '") + + def getRelease(self, htmltree): + return super().getRelease(htmltree).replace('年','-').replace('月','-').replace('日','') + + def getCover(self, htmltree): + return 'https:' + super().getCover(htmltree).replace('.webp', '.jpg') + + def getTags(self, htmltree): + return self.getTreeAll(htmltree, self.expr_tags) + + def getExtrafanart(self, htmltree): + try: + result = [] + for i in self.getTreeAll(self.expr_extrafanart): + result.append("https:" + i) + except: + result = '' + return result diff --git a/scrapinglib/fanza.py b/scrapinglib/fanza.py new file mode 100644 index 0000000..1f60d3e --- /dev/null +++ b/scrapinglib/fanza.py @@ -0,0 +1,130 @@ +# -*- coding: utf-8 -*- + +import re +from lxml import etree +from urllib.parse import urlencode +from .parser import Parser + + +class Fanza(Parser): + source = 'fanza' + + expr_title = '//*[starts-with(@id, "title")]/text()' + expr_actor = "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()" + expr_cover = '//head/meta[@property="og:image"]' + expr_extrafanart = '//a[@name="sample-image"]/img/@src' + expr_outline = "//div[@class='mg-b20 lh4']/text()" + expr_outline2 = "//div[@class='mg-b20 lh4']//p/text()" + expr_outline_og = '//head/meta[@property="og:description"]' + expr_runtime = "//td[contains(text(),'収録時間')]/following-sibling::td/text()" + + def search(self, number): + self.number = number + # fanza allow letter + number + underscore, normalize the input here + # @note: I only find the usage of underscore as h_test123456789 + fanza_search_number = number + # AV_Data_Capture.py.getNumber() over format the input, restore the h_ prefix + if fanza_search_number.startswith("h-"): + fanza_search_number = fanza_search_number.replace("h-", "h_") + + fanza_search_number = re.sub(r"[^0-9a-zA-Z_]", "", fanza_search_number).lower() + + fanza_urls = [ + "https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=", + "https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=", + "https://www.dmm.co.jp/digital/anime/-/detail/=/cid=", + "https://www.dmm.co.jp/mono/anime/-/detail/=/cid=", + "https://www.dmm.co.jp/digital/videoc/-/detail/=/cid=", + "https://www.dmm.co.jp/digital/nikkatsu/-/detail/=/cid=", + "https://www.dmm.co.jp/rental/-/detail/=/cid=", + ] + + for url in fanza_urls: + self.detailurl = url + fanza_search_number + url = "https://www.dmm.co.jp/age_check/=/declared=yes/?"+ urlencode({"rurl": self.detailurl}) + self.htmlcode = self.getHtml(url) + if self.htmlcode != 404: + self.htmltree = etree.HTML(self.htmlcode) + break + if self.htmlcode == 404: + return 404 + result = self.dictformat(self.htmltree) + return result + + def getNum(self, htmltree): + # for some old page, the input number does not match the page + # for example, the url will be cid=test012 + # but the hinban on the page is test00012 + # so get the hinban first, and then pass it to following functions + self.fanza_hinban = self.getFanzaString('品番:') + self.number = self.fanza_hinban + number_lo = self.number.lower() + if (re.sub('-|_', '', number_lo) == self.fanza_hinban or + number_lo.replace('-', '00') == self.fanza_hinban or + number_lo.replace('-', '') + 'so' == self.fanza_hinban + ): + self.number = self.number + return self.number + + def getStudio(self, htmltree): + return 
self.getFanzaString('メーカー') + + def getOutline(self, htmltree): + try: + result = self.getTreeElement(htmltree, self.expr_outline).replace("\n", "") + if result == '': + result = self.getTreeElement(htmltree, self.expr_outline2).replace("\n", "") + if "※ 配信方法によって収録内容が異なる場合があります。" == result: + result = self.getTreeElement(htmltree, self.expr_outline_og).get('content') + return result + except: + return '' + + def getRuntime(self, htmltree): + return str(re.search(r'\d+', super().getRuntime(htmltree)).group()).strip(" ['']") + + def getDirector(self, htmltree): + if "anime" not in self.detailurl: + return self.getFanzaString('監督:') + return '' + + def getActors(self, htmltree): + if "anime" not in self.detailurl: + return super().getActors(htmltree) + return '' + + def getRelease(self, htmltree): + result = self.getFanzaString('発売日:') + if result == '' or result == '----': + result = self.getFanzaString('配信開始日:') + return result.replace("/", "-").strip('\\n') + + def getCover(self, htmltree): + return self.getTreeElement(htmltree, './/head/meta[@property="og:image"]').get('content') + + def getTags(self, htmltree): + return self.getFanzaStrings('ジャンル:') + + def getLabel(self, htmltree): + ret = self.getFanzaStrings('レーベル') + if ret == "----": + return '' + return ret + + def getSeries(self, htmltree): + ret = self.getFanzaStrings('シリーズ:') + if ret == "----": + return '' + return ret + + def getFanzaString(self, expr): + result1 = str(self.htmltree.xpath("//td[contains(text(),'"+expr+"')]/following-sibling::td/a/text()")).strip(" ['']") + result2 = str(self.htmltree.xpath("//td[contains(text(),'"+expr+"')]/following-sibling::td/text()")).strip(" ['']") + return result1+result2 + + def getFanzaStrings(self, string): + result1 = self.htmltree.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/a/text()") + if len(result1) > 0: + return result1 + result2 = self.htmltree.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/text()") + return result2 diff --git a/scrapinglib/fc2.py b/scrapinglib/fc2.py new file mode 100644 index 0000000..c12a1ce --- /dev/null +++ b/scrapinglib/fc2.py @@ -0,0 +1,71 @@ +# -*- coding: utf-8 -*- + +import re +from lxml import etree +from urllib.parse import urljoin + +from .parser import Parser + + +class Fc2(Parser): + source = 'fc2' + imagecut = 0 + + expr_title = '/html/head/title/text()' + expr_studio = '//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()' + expr_release = '//*[@id="top"]/div[1]/section[1]/div/section/div[2]/div[2]/p/text()' + expr_runtime = "//p[@class='items_article_info']/text()" + expr_director = '//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()' + expr_actor = '//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()' + expr_cover = "//div[@class='items_article_MainitemThumb']/span/img/@src" + expr_tags = "//a[@class='tag tagTag']/text()" + + def search(self, number): + self.number = number.replace('FC2-', '').replace('fc2-', '') + self.detailurl = 'https://adult.contents.fc2.com/article/' + self.number + '/' + self.htmlcode = self.getHtml(self.detailurl) + if self.htmlcode == 404: + return 404 + htmltree = etree.HTML(self.htmlcode) + result = self.dictformat(htmltree) + return result + + def getNum(self, htmltree): + return 'FC2-' + self.number + + def getRelease(self, htmltree): + return super().getRelease(htmltree).strip(" ['販売日 : ']").replace('/','-') + + def getActors(self, htmltree): + actors = super().getActors(htmltree) + if not actors: + actors = '素人' 
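# FC2 listings frequently name no performers; defaulting to 素人 ("amateur")
# keeps the actor field usable downstream instead of leaving it empty.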
+ return actors + + def getCover(self, htmltree): + return urljoin('https://adult.contents.fc2.com', super().getCover(htmltree)) + + def getExtrafanart(self, htmltree): + html_pather = re.compile(r'
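
The refactor above folds the per-site WebCrawler modules into Parser subclasses behind a single entry point. A minimal usage sketch of scrapinglib.search(), following the signature added in scrapinglib/api.py (the number, source order, and proxy address here are illustrative):

from scrapinglib import search

# Adult lookup with an explicit source priority; search() returns the metadata
# dict from the first source whose result passes get_data_state(), or None
# when every source misses.
data = search('ABP-890', 'javbus,javdb',
              proxies={'http': 'socks5h://127.0.0.1:1080',
                       'https': 'socks5h://127.0.0.1:1080'},
              morestoryline=True)
if data:
    print(data['number'], data['title'])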