diff --git a/WebCrawler/__init__.py b/WebCrawler/__init__.py
deleted file mode 100644
index 7546802..0000000
--- a/WebCrawler/__init__.py
+++ /dev/null
@@ -1,372 +0,0 @@
-import json
-import re
-from multiprocessing.pool import ThreadPool
-
-import ADC_function
-import config
-from ADC_function import translate
-from lxml import etree
-from pathlib import Path
-
-# =========website========
-from . import airav
-from . import avsox
-from . import fanza
-from . import fc2
-from . import jav321
-from . import javbus
-from . import javdb
-from . import mgstage
-from . import xcity
-# from . import javlib
-from . import dlsite
-from . import carib
-from . import fc2club
-from . import mv91
-from . import madou
-from . import gcolle
-from . import getchu
-
-
-def get_data_state(data: dict) -> bool:  # detect whether the metadata fetch failed
- if "title" not in data or "number" not in data:
- return False
-
- if data["title"] is None or data["title"] == "" or data["title"] == "null":
- return False
-
- if data["number"] is None or data["number"] == "" or data["number"] == "null":
- return False
-
- return True
-
-
-def get_data_from_json(file_number, oCC):
- """
- iterate through all services and fetch the data 从JSON返回元数据
- """
-
- actor_mapping_data = etree.parse(str(Path.home() / '.local' / 'share' / 'mdc' / 'mapping_actor.xml'))
- info_mapping_data = etree.parse(str(Path.home() / '.local' / 'share' / 'mdc' / 'mapping_info.xml'))
-
- func_mapping = {
- "airav": airav.main,
- "avsox": avsox.main,
- "fc2": fc2.main,
- "fanza": fanza.main,
- "javdb": javdb.main,
- "javbus": javbus.main,
- "mgstage": mgstage.main,
- "jav321": jav321.main,
- "xcity": xcity.main,
- # "javlib": javlib.main,
- "dlsite": dlsite.main,
- "carib": carib.main,
- "fc2club": fc2club.main,
- "mv91": mv91.main,
- "madou": madou.main,
- "gcolle": gcolle.main,
- "getchu": getchu.main,
- }
-
- conf = config.getInstance()
- # default fetch order list, from the beginning to the end
- sources = conf.sources().split(',')
-    def insert(sources, source):
- if source in sources:
- sources.insert(0, sources.pop(sources.index(source)))
- return sources
-
- if len(sources) <= len(func_mapping):
-        # if the input file name matches certain rules,
-        # move some web services to the front of the list
-        lo_file_number = file_number.lower()
-        if "carib" in sources and re.search(r"^\d{6}-\d{3}", file_number):
-            sources = insert(sources, "carib")
- elif "item" in file_number or "GETCHU" in file_number.upper():
- sources = insert(sources,"getchu")
- elif "rj" in lo_file_number or "vj" in lo_file_number or re.search(r"[\u3040-\u309F\u30A0-\u30FF]+", file_number):
- sources = insert(sources, "getchu")
- sources = insert(sources, "dlsite")
- elif re.search(r"^\d{5,}", file_number) or "heyzo" in lo_file_number:
- if "avsox" in sources:
- sources = insert(sources,"avsox")
- elif "mgstage" in sources and \
- (re.search(r"\d+\D+", file_number) or "siro" in lo_file_number):
- sources = insert(sources,"mgstage")
- elif "fc2" in lo_file_number:
- if "fc2" in sources:
- sources = insert(sources,"fc2")
- elif "gcolle" in sources and (re.search("\d{6}", file_number)):
- sources = insert(sources,"gcolle")
- elif re.search(r"^[a-z0-9]{3,}$", lo_file_number):
- if "xcity" in sources:
- sources = insert(sources,"xcity")
- if "madou" in sources:
- sources = insert(sources,"madou")
- elif "madou" in sources and (
- re.search(r"^[a-z0-9]{3,}-[0-9]{1,}$", lo_file_number)
- ):
- sources = insert(sources,"madou")
-
-    # drop any configured source that has no crawler in func_mapping
-    todel = []
-    for s in sources:
-        if s not in func_mapping:
-            print('[!] Source Not Exist : ' + s)
-            todel.append(s)
-    for d in todel:
-        print('[!] Remove Source : ' + d)
-        sources.remove(d)
-
- json_data = {}
-
-    if conf.multi_threading():
-        pool = ThreadPool(processes=len(sources))
-
-        # queue every source exactly once, in the priority order decided above
-        async_results = {s: pool.apply_async(func_mapping[s], (file_number,)) for s in sources}
-
-        # collect the crawling responses in priority order
-        for source in sources:
-            if conf.debug():
-                print('[+]select', source)
-            result = async_results[source].get()
-            try:
-                json_data = json.loads(result)
-            except (TypeError, ValueError):
-                json_data = result
-            # if any service returns a valid result, stop
-            if get_data_state(json_data):
-                print(f"[+]Find movie [{file_number}] metadata on website '{source}'")
-                break
-        pool.close()
-        pool.terminate()
-    else:
-        for source in sources:
-            try:
-                if conf.debug():
-                    print('[+]select', source)
-                result = func_mapping[source](file_number)
-                try:
-                    json_data = json.loads(result)
-                except (TypeError, ValueError):
-                    json_data = result
-                # if any service returns a valid result, stop
-                if get_data_state(json_data):
-                    print(f"[+]Find movie [{file_number}] metadata on website '{source}'")
-                    break
-            except:
-                break
-
- # Return if data not found in all sources
- if not json_data:
- print('[-]Movie Number not found!')
- return None
-
-    # Strict number check: guards against a broken source that ignores the query and
-    # always returns the same record (e.g. "本橋実来 ADZ335") with a mismatched number.
-    # The number naming rules currently follow javdb.com (domain created 2013-06-19T18:34:27Z);
-    # other rule sets such as airav.wiki (domain created 2019-08-28T07:18:42.0Z) are worth tracking.
-    # If javdb.com's rules ever produce same-name collisions between studios, switch rules
-    # and update the corresponding number parsing and crawling code.
-    if str(json_data.get('number')).upper() != file_number.upper():
-        if not json_data.get('allow_number_change'):
-            print('[-]Movie number has changed! [{}]->[{}]'.format(file_number, str(json_data.get('number'))))
-            return None
-
-    # ================================================ end of website-specific rules ================================================
-
- title = json_data.get('title')
-    actor_list = str(json_data.get('actor')).strip("[ ]").replace("'", '').split(',')  # convert string to list
-    actor_list = [actor.strip() for actor in actor_list]  # trim whitespace
- director = json_data.get('director')
- release = json_data.get('release')
- number = json_data.get('number')
- studio = json_data.get('studio')
- source = json_data.get('source')
- runtime = json_data.get('runtime')
- outline = json_data.get('outline')
- label = json_data.get('label')
- series = json_data.get('series')
- year = json_data.get('year')
-
-    cover_small = json_data.get('cover_small') or ''
-    trailer = json_data.get('trailer') or ''
-    extrafanart = json_data.get('extrafanart') or ''
-
- imagecut = json_data.get('imagecut')
-    tag = str(json_data.get('tag')).strip("[ ]").replace("'", '').replace(" ", '').split(',')  # convert string to list
- while 'XXXX' in tag:
- tag.remove('XXXX')
- while 'xxx' in tag:
- tag.remove('xxx')
- actor = str(actor_list).strip("[ ]").replace("'", '').replace(" ", '')
-
- if title == '' or number == '':
- print('[-]Movie Number or Title not found!')
- return None
-
- # if imagecut == '3':
- # DownloadFileWithFilename()
-
-    # ==================== sanitize forbidden characters ====================== #\/:*?"<>|
- actor = special_characters_replacement(actor)
- actor_list = [special_characters_replacement(a) for a in actor_list]
- title = special_characters_replacement(title)
- label = special_characters_replacement(label)
- outline = special_characters_replacement(outline)
- series = special_characters_replacement(series)
- studio = special_characters_replacement(studio)
- director = special_characters_replacement(director)
- tag = [special_characters_replacement(t) for t in tag]
-    release = release.replace('/', '-')
-    cover_small = cover_small.split(',')[0].strip('\"').strip('\'')
-    # ==================== sanitize forbidden characters END ================== #\/:*?"<>|
-
-    # return the processed json_data
- json_data['title'] = title
- json_data['original_title'] = title
- json_data['actor'] = actor
- json_data['release'] = release
- json_data['cover_small'] = cover_small
- json_data['tag'] = tag
- json_data['year'] = year
- json_data['actor_list'] = actor_list
- json_data['trailer'] = trailer
- json_data['extrafanart'] = extrafanart
- json_data['label'] = label
- json_data['outline'] = outline
- json_data['series'] = series
- json_data['studio'] = studio
- json_data['director'] = director
-
- if conf.is_translate():
- translate_values = conf.translate_values().split(",")
- for translate_value in translate_values:
- if json_data[translate_value] == "":
- continue
- if translate_value == "title":
- title_dict = json.loads(
- (Path.home() / '.local' / 'share' / 'mdc' / 'c_number.json').read_text(encoding="utf-8"))
-                try:
-                    json_data[translate_value] = title_dict[number]
-                    continue
-                except KeyError:
-                    pass
- if conf.get_translate_engine() == "azure":
- t = translate(
- json_data[translate_value],
- target_language="zh-Hans",
- engine=conf.get_translate_engine(),
- key=conf.get_translate_key(),
- )
- else:
- t = translate(json_data[translate_value])
- if len(t):
- json_data[translate_value] = special_characters_replacement(t)
-
- if oCC:
- cc_vars = conf.cc_convert_vars().split(",")
- ccm = conf.cc_convert_mode()
-        def convert_list(mapping_data, language, vars):
-            total = []
-            for i in vars:
-                matches = mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=f",{i},")
-                if len(matches) != 0:
-                    i = matches[0]
-                total.append(i)
-            return total
-        def convert(mapping_data, language, vars):
-            matches = mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=vars)
-            if len(matches) != 0:
-                return matches[0]
-            raise IndexError('keyword not found')
- for cc in cc_vars:
- if json_data[cc] == "" or len(json_data[cc]) == 0:
- continue
- if cc == "actor":
- try:
- if ccm == 1:
- json_data['actor_list'] = convert_list(actor_mapping_data, "zh_cn", json_data['actor_list'])
- json_data['actor'] = convert(actor_mapping_data, "zh_cn", json_data['actor'])
- elif ccm == 2:
- json_data['actor_list'] = convert_list(actor_mapping_data, "zh_tw", json_data['actor_list'])
- json_data['actor'] = convert(actor_mapping_data, "zh_tw", json_data['actor'])
- elif ccm == 3:
- json_data['actor_list'] = convert_list(actor_mapping_data, "jp", json_data['actor_list'])
- json_data['actor'] = convert(actor_mapping_data, "jp", json_data['actor'])
- except:
- json_data['actor_list'] = [oCC.convert(aa) for aa in json_data['actor_list']]
- json_data['actor'] = oCC.convert(json_data['actor'])
- elif cc == "tag":
- try:
- if ccm == 1:
- json_data[cc] = convert_list(info_mapping_data, "zh_cn", json_data[cc])
- json_data[cc] = ADC_function.delete_all_elements_in_list("删除", json_data[cc])
- elif ccm == 2:
- json_data[cc] = convert_list(info_mapping_data, "zh_tw", json_data[cc])
- json_data[cc] = ADC_function.delete_all_elements_in_list("删除", json_data[cc])
- elif ccm == 3:
- json_data[cc] = convert_list(info_mapping_data, "jp", json_data[cc])
- json_data[cc] = ADC_function.delete_all_elements_in_list("删除", json_data[cc])
- except:
- json_data[cc] = [oCC.convert(t) for t in json_data[cc]]
- else:
- try:
- if ccm == 1:
- json_data[cc] = convert(info_mapping_data, "zh_cn", json_data[cc])
- json_data[cc] = ADC_function.delete_all_elements_in_str("删除", json_data[cc])
- elif ccm == 2:
- json_data[cc] = convert(info_mapping_data, "zh_tw", json_data[cc])
- json_data[cc] = ADC_function.delete_all_elements_in_str("删除", json_data[cc])
- elif ccm == 3:
- json_data[cc] = convert(info_mapping_data, "jp", json_data[cc])
- json_data[cc] = ADC_function.delete_all_elements_in_str("删除", json_data[cc])
- except IndexError:
- json_data[cc] = oCC.convert(json_data[cc])
- except:
- pass
-
- naming_rule=""
- for i in conf.naming_rule().split("+"):
- if i not in json_data:
- naming_rule += i.strip("'").strip('"')
- else:
- item = json_data.get(i)
- naming_rule += item if type(item) is not list else "&".join(item)
-
- json_data['naming_rule'] = naming_rule
- return json_data
-
-
-def special_characters_replacement(text) -> str:
- if not isinstance(text, str):
- return text
-    return (text.replace('\\', '∖').  # U+2216 SET MINUS @ Basic Multilingual Plane
-            replace('/', '∕').  # U+2215 DIVISION SLASH @ Basic Multilingual Plane
-            replace(':', '꞉').  # U+A789 MODIFIER LETTER COLON @ Latin Extended-D
-            replace('*', '∗').  # U+2217 ASTERISK OPERATOR @ Basic Multilingual Plane
-            replace('?', '？').  # U+FF1F FULLWIDTH QUESTION MARK @ Basic Multilingual Plane
-            replace('"', '＂').  # U+FF02 FULLWIDTH QUOTATION MARK @ Basic Multilingual Plane
-            replace('<', 'ᐸ').  # U+1438 CANADIAN SYLLABICS PA @ Basic Multilingual Plane
-            replace('>', 'ᐳ').  # U+1433 CANADIAN SYLLABICS PO @ Basic Multilingual Plane
-            replace('|', 'ǀ').  # U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane
-            replace('&lsquo;', '‘').  # U+2018 LEFT SINGLE QUOTATION MARK
-            replace('&rsquo;', '’').  # U+2019 RIGHT SINGLE QUOTATION MARK
-            replace('&hellip;', '…').
-            replace('&amp;', '&')
-            )
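The `insert` helper in the deleted `get_data_from_json` reorders the configured source list in place, promoting a source to the front only if the user actually enabled it. A minimal standalone sketch (the `sources` value here is hypothetical; the real list comes from `config.getInstance().sources()`):

```python
def insert(sources, source):
    # move `source` to the front of the priority list if it is configured; no-op otherwise
    if source in sources:
        sources.insert(0, sources.pop(sources.index(source)))
    return sources

sources = "javbus,javdb,carib,dlsite".split(',')
print(insert(sources, "carib"))   # ['carib', 'javbus', 'javdb', 'dlsite']
print(insert(sources, "getchu"))  # unchanged: 'getchu' is not in the configured list
```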
diff --git a/WebCrawler/airav.py b/WebCrawler/airav.py
deleted file mode 100644
index d25b8ad..0000000
--- a/WebCrawler/airav.py
+++ /dev/null
@@ -1,227 +0,0 @@
-import sys
-sys.path.append('../')
-from bs4 import BeautifulSoup  # needs install: pip install beautifulsoup4
-from ADC_function import *
-from WebCrawler import javbus
-
-'''
-API
-Sign-up:  https://www.airav.wiki/api/auth/signup
-Settings: https://www.airav.wiki/api/get_web_settings
-Search:   https://www.airav.wiki/api/video/list?lng=zh-CN&search=
-Search:   https://www.airav.wiki/api/video/list?lang=zh-TW&lng=zh-TW&search=
-'''
-host = 'https://www.airav.wiki'
-
-# airav has no actor photos, so use the ones from javbus
-def getActorPhoto(javbus_json):
- result = javbus_json.get('actor_photo')
- if isinstance(result, dict) and len(result):
- return result
- return ''
-
-def getTitle(htmlcode):  # get title
- html = etree.fromstring(htmlcode, etree.HTMLParser())
- title = str(html.xpath('/html/head/title/text()')[0])
- result = str(re.findall('](.*?)- AIRAV-WIKI', title)[0]).strip()
- return result
-
-def getStudio(htmlcode, javbus_json):  # get studio (revised)
-    # prefer javbus data when available
- result = javbus_json.get('studio')
- if isinstance(result, str) and len(result):
- return result
- html = etree.fromstring(htmlcode,etree.HTMLParser())
- return str(html.xpath('//a[contains(@href,"?video_factory=")]/text()')).strip(" ['']")
-def getYear(htmlcode, javbus_json):  # get year
- result = javbus_json.get('year')
- if isinstance(result, str) and len(result):
- return result
- release = getRelease(htmlcode, javbus_json)
- if len(release) != len('2000-01-01'):
- return ''
- return release[:4]
-def getCover(htmlcode, javbus_json):  # get cover image
- result = javbus_json.get('cover')
- if isinstance(result, str) and len(result):
- return result
- html = etree.fromstring(htmlcode, etree.HTMLParser())
- return html.xpath('//img[contains(@src,"/storage/big_pic/")]/@src')[0]
-def getRelease(htmlcode, javbus_json):  # get release date
- result = javbus_json.get('release')
- if isinstance(result, str) and len(result):
- return result
- html = etree.fromstring(htmlcode, etree.HTMLParser())
- try:
- result = re.search(r'\d{4}-\d{2}-\d{2}', str(html.xpath('//li[contains(text(),"發片日期")]/text()'))).group()
- except:
- return ''
- return result
-def getRuntime(javbus_json):  # get runtime
- result = javbus_json.get('runtime')
- if isinstance(result, str) and len(result):
- return result
- return ''
-# airav's actress entries mostly use kanji names while javbus mostly uses kana, so airav takes priority
-def getActor(htmlcode, javbus_json):  # get actresses
- b=[]
- html = etree.fromstring(htmlcode, etree.HTMLParser())
- a = html.xpath('//ul[@class="videoAvstarList"]/li/a[starts-with(@href,"/idol/")]/text()')
- for v in a:
- v = v.strip()
- if len(v):
- b.append(v)
- if len(b):
- return b
- result = javbus_json.get('actor')
- if isinstance(result, list) and len(result):
- return result
- return []
-def getNum(htmlcode, javbus_json):  # get number
- result = javbus_json.get('number')
- if isinstance(result, str) and len(result):
- return result
- html = etree.fromstring(htmlcode, etree.HTMLParser())
- title = str(html.xpath('/html/head/title/text()')[0])
-    result = str(re.findall(r'^\[(.*?)]', title)[0])
- return result
-def getDirector(javbus_json):  # get director (revised)
- result = javbus_json.get('director')
- if isinstance(result, str) and len(result):
- return result
- return ''
-def getOutline(htmlcode):  # get outline
- html = etree.fromstring(htmlcode, etree.HTMLParser())
- try:
- result = html.xpath("string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)").replace('\n','').strip()
- return result
- except:
- return ''
-def getSeries(javbus_json):  # get series (revised)
- result = javbus_json.get('series')
- if isinstance(result, str) and len(result):
- return result
- return ''
-def getTag(htmlcode):  # get tags
- tag = []
- soup = BeautifulSoup(htmlcode, 'lxml')
- x = soup.find_all(attrs={'class': 'tagBtnMargin'})
- a = x[0].find_all('a')
-
- for i in a:
- tag.append(i.get_text())
- return tag
-
-def getExtrafanart(htmlcode):  # get extra fanart (stills)
-    html_pather = re.compile(r'<div class="mobileImgThumbnail">[\s\S]*?</div></div></div></div>')
-    html = html_pather.search(htmlcode)
-    if html:
-        html = html.group()
-        extrafanart_pather = re.compile(r'<img.*?src="(.*?)"')
-        extrafanart_imgs = extrafanart_pather.findall(html)
-        if extrafanart_imgs:
-            return extrafanart_imgs
-    return ''
-
-def search(keyword):
-    result = []
-    page = 1
-    while page > 0:
-        # example response:
-        # search_result = {"offset": 0,"count": 4,"result": [
-        # {"vid": "99-07-15076","slug": "Wrop6o","name": "朝ゴミ出しする近所の遊び好きノーブラ奥さん 江波りゅう",
-        # "url": "","view": 98,"img_url": "https://wiki-img.airav.wiki/storage/big_pic/99-07-15076.jpg","barcode": "_1pondo_012717_472"},
-        # {"vid": "99-27-00286","slug": "DlPEua","name": "放課後に、仕込んでください 〜優等生は無言でスカートを捲り上げる〜",
-        # "url": "","view": 69,"img_url": "https://wiki-img.airav.wiki/storage/big_pic/99-27-00286.jpg","barcode": "caribbeancom012717-360"},
-        # {"vid": "99-07-15070","slug": "VLS3WY","name": "放課後に、仕込んでください ~優等生は無言でスカートを捲り上げる~ ももき希",
-        # "url": "","view": 58,"img_url": "https://wiki-img.airav.wiki/storage/big_pic/99-07-15070.jpg","barcode": "caribbeancom_012717-360"},
-        # {"vid": "99-27-00287","slug": "YdMVb3","name": "朝ゴミ出しする近所の遊び好きノーブラ奥さん 江波りゅう",
-        # "url": "","view": 56,"img_url": "https://wiki-img.airav.wiki/storage/big_pic/99-27-00287.jpg","barcode": "1pondo_012717_472"}
-        # ],"status": "ok"}
-        search_result = get_html(host + '/api/video/list?lang=zh-TW&lng=jp&search=' + keyword + '&page=' + str(page))
-
-        try:
-            json_data = json.loads(search_result)
-        except json.decoder.JSONDecodeError:
-            # print("[-]Json decoder error!")
-            return []
-
-        result_offset = int(json_data["offset"])
-        result_count = int(json_data["count"])
-        result_size = len(json_data["result"])
-        if result_count <= 0 or result_size <= 0:
-            return result
-        elif result_count > result_offset + result_size:  # request the next page
-            result.extend(json_data["result"])
-            page += 1
-        elif result_count == result_offset + result_size:  # this was the last page
-            result.extend(json_data["result"])
-            page = 0
-        else:
-            page = 0
-
-    return result
-
-def main(number):
-    try:
-        htmlcode = ''
-        javbus_json = {}
-        try:
-            htmlcode = get_html('https://cn.airav.wiki/video/' + number)
-            javbus_json = json.loads(javbus.main(number))
-        except:
-            # keep the defaults; the getters below fall back gracefully
-            pass
-
-        dic = {
-            # title: use airav
-            'title': getTitle(htmlcode),
-            # studio: prefer javbus, fall back to this site
-            'studio': getStudio(htmlcode, javbus_json),
-            # year: prefer javbus, fall back to this site
-            'year': getYear(htmlcode, javbus_json),
-            # outline: use airav
-            'outline': getOutline(htmlcode),
-            # runtime: use javbus
-            'runtime': getRuntime(javbus_json),
-            # director: use javbus
-            'director': getDirector(javbus_json),
-            # actresses: prefer airav
-            'actor': getActor(htmlcode, javbus_json),
-            # release date: prefer javbus
-            'release': getRelease(htmlcode, javbus_json),
-            # number: use javbus
-            'number': getNum(htmlcode, javbus_json),
-            # cover url: use javbus
-            'cover': getCover(htmlcode, javbus_json),
-            # extra fanart (stills)
-            'extrafanart': getExtrafanart(htmlcode),
-            'imagecut': 1,
-            # tags: use airav
-            'tag': getTag(htmlcode),
-            # label: use javbus
-            'label': getSeries(javbus_json),
-            'actor_photo': getActorPhoto(javbus_json),
-            'website': 'https://www.airav.wiki/video/' + number,
-            'source': 'airav.py',
-            # series: use javbus
-            'series': getSeries(javbus_json)
-        }
- js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), ) # .encode('UTF-8')
- return js
- except Exception as e:
- if config.getInstance().debug():
- print(e)
- data = {
- "title": "",
- }
- js = json.dumps(
- data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
- )
- return js
-
-
-if __name__ == '__main__':
- config.getInstance().set_override("actor_photo:download_for_kodi=1")
- config.getInstance().set_override("debug_mode:switch=1")
-    print(main('ADV-R0624'))  # javbus page returns 404; airav has data
-    print(main('ADN-188'))  # single actress
-    print(main('CJOD-278'))  # multiple actresses; javbus uses kana, airav uses kanji
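The docstring at the top of this file documents the airav list API. A hedged standalone sketch of paging through it, based only on the endpoints and the sample response commented inside `search()` above (`requests` stands in for the project's `get_html`; treat the parameter names as assumptions):

```python
import requests

HOST = 'https://www.airav.wiki'

def search_airav(keyword: str) -> list:
    """Collect all search hits by paging /api/video/list until `count` is reached."""
    collected, page = [], 1
    while True:
        resp = requests.get(f'{HOST}/api/video/list',
                            params={'lang': 'zh-TW', 'lng': 'jp',
                                    'search': keyword, 'page': page})
        data = resp.json()
        collected.extend(data.get('result', []))
        if not data.get('result') or int(data.get('count', 0)) <= len(collected):
            return collected
        page += 1

# each hit carries 'barcode' (the movie number) and 'img_url', per the sample response:
# for hit in search_airav('012717_472'):
#     print(hit['barcode'], hit['img_url'])
```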
diff --git a/WebCrawler/avsox.py b/WebCrawler/avsox.py
deleted file mode 100644
index a18eab6..0000000
--- a/WebCrawler/avsox.py
+++ /dev/null
@@ -1,86 +0,0 @@
-import sys
-sys.path.append('..')
-from ADC_function import *
-from WebCrawler.storyline import getStoryline
-from WebCrawler.crawler import *
-# import io
-# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
-
-def getActorPhoto(html):
- a = html.xpath('//a[@class="avatar-box"]')
- d = {}
- for i in a:
- l = i.find('.//img').attrib['src']
- t = i.find('span').text
- p2 = {t: l}
- d.update(p2)
- return d
-
-def getActor(html):
- a = html.xpath('//a[@class="avatar-box"]')
- d = []
- for i in a:
- d.append(i.find('span').text)
- return d
-
-def getCover_small(html):
- result = str(html.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')).strip(" ['']")
- return result
-def getTag(html):
- x = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
- return [i.strip() for i in x[2:]] if len(x) > 2 else []
-
-def main(number):
- html = get_html('https://tellme.pw/avsox')
- site = Crawler(html).getString('//div[@class="container"]/div/a/@href')
- a = get_html(site + '/cn/search/' + number)
- html = Crawler(a)
- result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
-    if result1 in ('', 'null', 'None'):
- a = get_html(site + '/cn/search/' + number.replace('-', '_'))
- html = Crawler(a)
- result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
-    if result1 in ('', 'null', 'None'):
- a = get_html(site + '/cn/search/' + number.replace('_', ''))
- html = Crawler(a)
- result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
- detail = get_html("https:" + result1)
- lx = etree.fromstring(detail, etree.HTMLParser())
- avsox_crawler2 = Crawler(a)
- avsox_crawler = Crawler(detail)
- try:
- new_number = avsox_crawler.getString('//span[contains(text(),"识别码:")]/../span[2]/text()')
- if new_number.upper() != number.upper():
- raise ValueError('number not found')
-        title = avsox_crawler.getString('/html/body/div[2]/h3/text()').replace('/', '').replace(new_number, '').strip()
- dic = {
- 'actor': getActor(lx),
- 'title': title,
- 'studio': avsox_crawler.getString('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()').replace("', '",' '),
- 'outline': getStoryline(number, title),
- 'runtime': avsox_crawler.getString('//span[contains(text(),"长度:")]/../text()').replace('分钟',''),
-            'director': '',
- 'release': avsox_crawler.getString('//span[contains(text(),"发行时间:")]/../text()'),
- 'number': new_number,
- 'cover': avsox_crawler.getString('/html/body/div[2]/div[1]/div[1]/a/img/@src'),
- #'cover_small' : getCover_small(html),
- 'cover_small': avsox_crawler2.getString('//*[@id="waterfall"]/div/a/div[1]/img/@src'),
- 'imagecut': 3,
- 'tag': getTag(lx),
- 'label': avsox_crawler.getString('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()'),
-            'year': re.findall(r'\d{4}', avsox_crawler.getString('//span[contains(text(),"发行时间:")]/../text()'))[0],
- 'actor_photo': getActorPhoto(lx),
- 'website': "https:" + result1,
- 'source': 'avsox.py',
- 'series': avsox_crawler.getString('//span[contains(text(),"系列:")]/../span[2]/text()'),
- }
- except Exception as e:
- if config.getInstance().debug():
- print(e)
- dic = {"title": ""}
- js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
- return js
-
-if __name__ == "__main__":
- print(main('012717_472'))
- print(main('1')) # got fake result raise 'number not found'
diff --git a/WebCrawler/carib.py b/WebCrawler/carib.py
deleted file mode 100755
index 50cbcc1..0000000
--- a/WebCrawler/carib.py
+++ /dev/null
@@ -1,133 +0,0 @@
-import sys
-sys.path.append('../')
-from lxml import html
-from ADC_function import *
-from WebCrawler.storyline import getStoryline
-
-
-G_SITE = 'https://www.caribbeancom.com'
-
-
-def main(number: str) -> str:
- try:
- url = f'{G_SITE}/moviepages/{number}/index.html'
- result, session = get_html_session(url, return_type='session')
- htmlcode = result.content.decode('euc-jp')
- if not result or not htmlcode or '404' in htmlcode or 'class="movie-info section"' not in htmlcode:
- raise ValueError("page not found")
-
- lx = html.fromstring(htmlcode)
- title = get_title(lx)
-
- dic = {
- 'title': title,
- 'studio': '加勒比',
- 'year': get_year(lx),
- 'outline': get_outline(lx, number, title),
- 'runtime': get_runtime(lx),
- 'director': '',
- 'actor': get_actor(lx),
- 'release': get_release(lx),
- 'number': number,
- 'cover': f'{G_SITE}/moviepages/{number}/images/l_l.jpg',
- 'tag': get_tag(lx),
- 'extrafanart': get_extrafanart(lx),
- 'label': get_series(lx),
- 'imagecut': 1,
- 'website': f'{G_SITE}/moviepages/{number}/index.html',
- 'source': 'carib.py',
- 'series': get_series(lx),
- '无码': True
- }
- if config.getInstance().download_actor_photo_for_kodi():
- dic['actor_photo'] = get_actor_photo(lx, session)
- js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
- return js
-
- except Exception as e:
- if config.getInstance().debug():
- print(e)
- dic = {"title": ""}
- return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
-
-
-def get_title(lx: html.HtmlElement) -> str:
- return str(lx.xpath("//div[@class='movie-info section']/div[@class='heading']/h1[@itemprop='name']/text()")[0]).strip()
-
-def get_year(lx: html.HtmlElement) -> str:
- return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0][:4]
-
-def get_outline(lx: html.HtmlElement, number: str, title: str) -> str:
- o = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip()
- g = getStoryline(number, title, 无码=True)
- if len(g):
- return g
- return o
-
-def get_release(lx: html.HtmlElement) -> str:
- return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0].replace('/','-')
-
-def get_actor(lx: html.HtmlElement):
- r = []
- actors = lx.xpath("//span[@class='spec-content']/a[@itemprop='actor']/span/text()")
- for act in actors:
- if str(act) != '他':
- r.append(act)
- return r
-
-def get_tag(lx: html.HtmlElement) -> list:
- genres = lx.xpath("//span[@class='spec-content']/a[@itemprop='genre']/text()")
- return genres
-
-def get_extrafanart(lx: html.HtmlElement) -> list:
- r = []
- genres = lx.xpath("//*[@id='sampleexclude']/div[2]/div/div[@class='grid-item']/div/a/@href")
- for g in genres:
- jpg = str(g)
- if '/member/' in jpg:
- break
- else:
- r.append('https://www.caribbeancom.com' + jpg)
- return r
-
-def get_series(lx: html.HtmlElement) -> str:
- try:
- return str(lx.xpath("//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()")[0]).strip()
- except:
- return ''
-
-def get_runtime(lx: html.HtmlElement) -> str:
- return str(lx.xpath("//span[@class='spec-content']/span[@itemprop='duration']/text()")[0]).strip()
-
-def get_actor_photo(lx, session):
- htmla = lx.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']")
- names = lx.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']/span[@itemprop='name']/text()")
- t = {}
- for name, a in zip(names, htmla):
- if name.strip() == '他':
- continue
- p = {name.strip(): a.attrib['href']}
- t.update(p)
- o = {}
- for k, v in t.items():
- if '/search_act/' not in v:
- continue
- r = session.get(urljoin(G_SITE, v))
- if not r.ok:
- continue
- html = r.text
- pos = html.find('.full-bg')
-        if pos < 0:
- continue
- css = html[pos:pos+100]
- cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I)
- if not cssBGjpgs or not len(cssBGjpgs[0]):
- continue
- p = {k: urljoin(r.url, cssBGjpgs[0])}
- o.update(p)
- return o
-
-if __name__ == "__main__":
- print(main("070116-197")) # actor have photo
- print(main("041721-001"))
- print(main("080520-001"))
diff --git a/WebCrawler/crawler.py b/WebCrawler/crawler.py
deleted file mode 100644
index e6176b6..0000000
--- a/WebCrawler/crawler.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from lxml import etree
-
-class Crawler:
-    def __init__(self, htmlcode):
-        self.html = etree.HTML(htmlcode)
-
-    def getString(self, _xpath):
-        if _xpath == "":
-            return ""
-        result = self.html.xpath(_xpath)
-        try:
-            return result[0]
-        except IndexError:
-            return ""
-
-    def getStrings(self, _xpath):
-        # xpath() already returns a list, so no exception handling is needed
-        return self.html.xpath(_xpath)
-
-    def getOutline(self, _xpath):
-        return "\n".join(self.html.xpath(_xpath))
\ No newline at end of file
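This tiny wrapper exists so call sites can chain XPath lookups without guarding every miss. A self-contained usage sketch (the sample HTML is made up; assumes the repository root is on sys.path so the package import resolves):

```python
from WebCrawler.crawler import Crawler

page = '<html><body><h3>ABC-123 Title</h3><p>line1</p><p>line2</p></body></html>'
c = Crawler(page)
print(c.getString('//h3/text()'))   # 'ABC-123 Title'
print(c.getString('//h4/text()'))   # '' (missing node, no IndexError)
print(c.getStrings('//p/text()'))   # ['line1', 'line2']
print(c.getOutline('//p/text()'))   # 'line1\nline2'
```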
diff --git a/WebCrawler/dlsite.py b/WebCrawler/dlsite.py
deleted file mode 100644
index 54ed6f7..0000000
--- a/WebCrawler/dlsite.py
+++ /dev/null
@@ -1,185 +0,0 @@
-import re
-from lxml import etree
-import json
-import sys
-sys.path.append('../')
-from ADC_function import *
-
-def getTitle(html):
- result = str(html.xpath('/html/head/title/text()')[0])
- result = result[:result.rfind(' | DLsite')]
- result = result[:result.rfind(' [')]
- result = result.replace('【HD版】', '')
- return result
-def getActor(html):  # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
-    # xpath() returns an empty list when nothing matches, so no try/except is needed
-    return html.xpath('//th[contains(text(),"声优")]/../td/a/text()')
-def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img
- a = actor.split(',')
- d={}
- for i in a:
- p={i:''}
- d.update(p)
- return d
-def getStudio(html):
- try:
- try:
- result = html.xpath('//th[contains(text(),"商标名")]/../td/span[1]/a/text()')[0]
- except:
- result = html.xpath('//th[contains(text(),"社团名")]/../td/span[1]/a/text()')[0]
- except:
- result = ''
- return result
-def getRuntime(a):
- html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
- result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']")
- result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']")
- return str(result1 + result2).strip('+').rstrip('mi')
-def getLabel(html):
- try:
- try:
- result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0]
- except:
- result = html.xpath('//th[contains(text(),"社团名")]/../td/span[1]/a/text()')[0]
- except:
- result = ''
- return result
-def getYear(release):
-    try:
-        return str(re.search(r'\d{4}', release).group())
-    except:
-        return release
-def getRelease(html):
- result1 = html.xpath('//th[contains(text(),"贩卖日")]/../td/a/text()')[0]
- return result1.replace('年','-').replace('月','-').replace('日','')
-def getTag(html):
- try:
- result = html.xpath('//th[contains(text(),"分类")]/../td/div/a/text()')
- return result
- except:
- return ''
-
-def getCover_small(a, index=0):
-    # same issue mentioned below:
-    # javdb sometimes returns multiple results;
-    # DO NOT just take the first one, use the one at the correct index number
-    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-    try:
-        result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
-        if 'https' not in result:
-            result = 'https:' + result
-        return result
-    except:  # 2020.7.17 Repair Cover Url crawl
-        result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@data-src")[index]
-        if 'https' not in result:
-            result = 'https:' + result
-        return result
-def getCover(html):
- result = html.xpath('//*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li[1]/picture/source/@srcset')[0]
- return result.replace('.webp', '.jpg')
-def getDirector(html):
- try:
- result = html.xpath('//th[contains(text(),"剧情")]/../td/a/text()')[0]
- except:
- result = ''
- return result
-def getOutline(html):
- total = []
- result = html.xpath('//*[@class="work_parts_area"]/p/text()')
- for i in result:
- total.append(i.strip('\r\n'))
- return str(total).strip(" ['']").replace("', '', '",r'\n').replace("', '",r'\n').strip(", '', '")
-def getSeries(html):
- try:
- try:
- result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0]
- except:
- result = html.xpath('//th[contains(text(),"社团名")]/../td/span[1]/a/text()')[0]
- except:
- result = ''
- return result
-#
-def getExtrafanart(html):
- try:
- result = []
- for i in html.xpath('//*[@id="work_left"]/div/div/div[1]/div/@data-src'):
- result.append("https:" + i)
- except:
- result = ''
- return result
-def main(number):
- try:
- if "RJ" in number or "VJ" in number:
- number = number.upper()
- htmlcode = get_html('https://www.dlsite.com/maniax/work/=/product_id/' + number + '.html/?locale=zh_CN', cookies={'locale': 'zh-cn'})
- html = etree.fromstring(htmlcode, etree.HTMLParser())
- else:
- htmlcode = get_html(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie', cookies={'locale': 'zh-cn'})
- html = etree.HTML(htmlcode)
- search_result = html.xpath('//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
- if len(search_result) == 0:
- number = number.replace("THE ANIMATION", "").replace("he Animation", "").replace("t", "").replace("T","")
- html = etree.HTML(get_html(
- f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie', cookies={'locale': 'zh-cn'}))
- search_result = html.xpath('//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
- if len(search_result) == 0:
- if "~" in number:
- number = number.replace("~","〜")
- elif "〜" in number:
- number = number.replace("〜","~")
- html = etree.HTML(get_html(
- f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie', cookies={'locale': 'zh-cn'}))
- search_result = html.xpath('//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
- if len(search_result) == 0:
- number = number.replace('上巻', '').replace('下巻', '').replace('前編', '').replace('後編', '')
- html = etree.HTML(get_html(
- f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie', cookies={'locale': 'zh-cn'}))
- search_result = html.xpath('//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
- a = search_result[0]
- html = etree.HTML(get_html(a,cookies={'locale': 'zh-cn'}))
-    number = str(re.findall(r"\wJ\w+", a)).strip(" [']")
- dic = {
- 'actor': getStudio(html),
- 'title': getTitle(html),
- 'studio': getStudio(html),
- 'outline': getOutline(html),
- 'runtime': '',
- 'director': getDirector(html),
- 'release': getRelease(html),
- 'number': number,
- 'cover': 'https:' + getCover(html),
- 'cover_small': '',
- 'imagecut': 4,
- 'tag': getTag(html),
- 'label': getLabel(html),
- 'year': getYear(getRelease(html)), # str(re.search('\d{4}',getRelease(a)).group()),
- 'actor_photo': '',
- 'website': 'https://www.dlsite.com/maniax/work/=/product_id/' + number + '.html',
- 'source': 'dlsite.py',
- 'series': getSeries(html),
- 'extrafanart':getExtrafanart(html),
- 'allow_number_change':True,
- }
- js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
- return js
- except Exception as e:
- if config.getInstance().debug():
- print(e)
- data = {
- "title": "",
- }
- js = json.dumps(
- data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
- )
- return js
-
-# main('DV-1562')
-# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
-if __name__ == "__main__":
- config.getInstance().set_override("debug_mode:switch=1")
- print(main('牝教師4~穢された教壇~ 「生意気ドジっ娘女教師・美結~高飛車ハメ堕ち2濁金」'))
- print(main('RJ329607'))
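`main()` above retries the DLsite search with progressively normalized keywords. Pulled out as a hypothetical helper that mirrors those fallbacks (same replacements, same order):

```python
def dlsite_keyword_variants(keyword: str):
    """Yield the search keywords main() would try, most specific first."""
    yield keyword
    # drop 'THE ANIMATION' suffixes (and, as in the original, every remaining t/T)
    k = (keyword.replace("THE ANIMATION", "").replace("he Animation", "")
                .replace("t", "").replace("T", ""))
    yield k
    # DLsite titles are inconsistent about wave dashes
    yield k.replace("~", "〜") if "~" in k else k.replace("〜", "~")
    # drop volume markers: 上巻/下巻 (first/second volume), 前編/後編 (first/second part)
    yield k.replace('上巻', '').replace('下巻', '').replace('前編', '').replace('後編', '')
```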
diff --git a/WebCrawler/fanza.py b/WebCrawler/fanza.py
deleted file mode 100644
index bae3713..0000000
--- a/WebCrawler/fanza.py
+++ /dev/null
@@ -1,190 +0,0 @@
-#!/usr/bin/python3
-# -*- coding: utf-8 -*-
-import sys
-sys.path.append('../')
-from urllib.parse import urlencode
-
-from ADC_function import *
-from WebCrawler.crawler import *
-
-class fanzaCrawler(Crawler):
- def getFanzaString(self,string):
- result1 = str(self.html.xpath("//td[contains(text(),'"+string+"')]/following-sibling::td/a/text()")).strip(" ['']")
- result2 = str(self.html.xpath("//td[contains(text(),'"+string+"')]/following-sibling::td/text()")).strip(" ['']")
- return result1+result2
-
- def getFanzaStrings(self, string):
- result1 = self.html.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/a/text()")
- if len(result1) > 0:
- return result1
- result2 = self.html.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/text()")
- return result2
-
-
-def getRelease(fanza_Crawler):
- result = fanza_Crawler.getFanzaString('発売日:')
- if result == '' or result == '----':
- result = fanza_Crawler.getFanzaString('配信開始日:')
- return result.replace("/", "-").strip('\\n')
-
-
-def getCover(html, number):
- cover_number = number
- try:
- result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0]
- except:
-        # sometimes fanza changes _ to \u005f in the image id
- if "_" in cover_number:
- cover_number = cover_number.replace("_", r"\u005f")
- try:
- result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0]
- except:
- # (TODO) handle more edge case
- # print(html)
- # raise exception here, same behavior as before
- # people's major requirement is fetching the picture
- raise ValueError("can not find image")
- return result
-
-
-def getOutline(html):
- try:
- result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")[0]).replace("\n", "")
- if result == "":
- result = str(html.xpath("//div[@class='mg-b20 lh4']//p/text()")[0]).replace("\n", "")
- except:
- # (TODO) handle more edge case
- # print(html)
- return ""
- return result
-
-
-def getExtrafanart(htmlcode):  # get extra fanart (stills)
-    html_pather = re.compile(r'<div id="sample-image-block"[\s\S]*?</div>\n')
-    html = html_pather.search(htmlcode)
-    if html:
-        html = html.group()
-        extrafanart_pather = re.compile(r'<img.*?src="(.*?)"')
-        extrafanart_imgs = extrafanart_pather.findall(html)
-        if extrafanart_imgs:
-            return extrafanart_imgs
-    return ''
diff --git a/WebCrawler/jav321.py b/WebCrawler/jav321.py
deleted file mode 100644
--- a/WebCrawler/jav321.py
+++ /dev/null
-import sys
-sys.path.append('../')
-import re
-import json
-from bs4 import BeautifulSoup
-from lxml import html
-from ADC_function import *
-
-
-def main(number: str) -> str:
- try:
- result = post_html(url="https://www.jav321.com/search", query={"sn": number})
- soup = BeautifulSoup(result.text, "html.parser")
- lx = html.fromstring(str(soup))
- except:
- dic = {"title": ""}
- return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
-
- if "/video/" in result.url:
- data = parse_info(soup)
-
- dic = {
- "title": get_title(lx),
- "year": get_year(data),
- "outline": get_outline(lx),
- "director": "",
- "cover": get_cover(lx),
- "imagecut": 1,
- "trailer": get_trailer(result.text),
- "extrafanart": get_extrafanart(result.text),
- "actor_photo": "",
- "website": result.url,
- "source": "jav321.py",
- **data,
- }
- else:
- dic = {"title": ""}
-
- return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
-
-def get_title(lx: html.HtmlElement) -> str:
- return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[1]/h3/text()")[0].strip()
-
-
-def parse_info(soup: BeautifulSoup) -> dict:
- data = soup.select_one("div.row > div.col-md-9")
-
- if data:
- dd = str(data).split("
")
- data_dic = {}
- for d in dd:
- data_dic[get_bold_text(h=d)] = d
-
- return {
- "actor": get_actor(data_dic),
- "label": get_label(data_dic),
- "studio": get_studio(data_dic),
- "tag": get_tag(data_dic),
- "number": get_number(data_dic).upper(),
- "release": get_release(data_dic),
- "runtime": get_runtime(data_dic).replace(" minutes", ""),
- "series": get_series(data_dic),
- }
- else:
- return {"title": ""}
-
-
-def get_bold_text(h: str) -> str:
- soup = BeautifulSoup(h, "html.parser")
- if soup.b:
- return soup.b.text
- else:
- return "UNKNOWN_TAG"
-
-
-def get_anchor_info(h: str) -> str:
- result = []
-
- data = BeautifulSoup(h, "html.parser").find_all("a", href=True)
- for d in data:
- result.append(d.text)
-
- return ",".join(result)
-
-
-def get_text_info(h: str) -> str:
- return h.split(": ")[1]
-
-def get_trailer(html) -> str:
- videourl_pather = re.compile(r'[\s\S]*?