Merge pull request #849 from Suwmlee/master

support specifiedUrl & javlibrary
This commit is contained in:
Yoshiko2
2022-07-31 01:41:21 +08:00
committed by GitHub
26 changed files with 451 additions and 316 deletions

View File

@@ -83,6 +83,8 @@ def argparse_function(ver: str) -> typing.Tuple[str, str, str, str, bool, bool]:
help="""Only show job list of files and numbers, and **NO** actual operation help="""Only show job list of files and numbers, and **NO** actual operation
is performed. It may help you correct wrong numbers before real job.""") is performed. It may help you correct wrong numbers before real job.""")
parser.add_argument("-v", "--version", action="version", version=ver) parser.add_argument("-v", "--version", action="version", version=ver)
parser.add_argument("-ss", "--specified-source", default='', nargs='?', help="specified Source.")
parser.add_argument("-su", "--specified-url", default='', nargs='?', help="specified Url.")
args = parser.parse_args() args = parser.parse_args()
@@ -120,7 +122,7 @@ is performed. It may help you correct wrong numbers before real job.""")
if no_net_op: if no_net_op:
conf.set_override("common:stop_counter=0;rerun_delay=0s;face:aways_imagecut=1") conf.set_override("common:stop_counter=0;rerun_delay=0s;face:aways_imagecut=1")
return args.file, args.number, args.logdir, args.regexstr, args.zero_op, no_net_op return args.file, args.number, args.logdir, args.regexstr, args.zero_op, no_net_op, args.specified_source, args.specified_url
class OutLogger(object): class OutLogger(object):
@@ -487,13 +489,13 @@ def create_data_and_move(movie_path: str, zero_op: bool, no_net_op: bool, oCC):
print('[!]', err) print('[!]', err)
def create_data_and_move_with_custom_number(file_path: str, custom_number, oCC): def create_data_and_move_with_custom_number(file_path: str, custom_number, oCC, specified_source, specified_url):
conf = config.getInstance() conf = config.getInstance()
file_name = os.path.basename(file_path) file_name = os.path.basename(file_path)
try: try:
print("[!] [{1}] As Number Processing for '{0}'".format(file_path, custom_number)) print("[!] [{1}] As Number Processing for '{0}'".format(file_path, custom_number))
if custom_number: if custom_number:
core_main(file_path, custom_number, oCC) core_main(file_path, custom_number, oCC, specified_source, specified_url)
else: else:
print("[-] number empty ERROR") print("[-] number empty ERROR")
print("[*]======================================================") print("[*]======================================================")
@@ -513,7 +515,7 @@ def create_data_and_move_with_custom_number(file_path: str, custom_number, oCC):
def main(args: tuple) -> Path: def main(args: tuple) -> Path:
(single_file_path, custom_number, logdir, regexstr, zero_op, no_net_op) = args (single_file_path, custom_number, logdir, regexstr, zero_op, no_net_op, specified_source, specified_url) = args
conf = config.getInstance() conf = config.getInstance()
main_mode = conf.main_mode() main_mode = conf.main_mode()
folder_path = "" folder_path = ""
@@ -609,9 +611,11 @@ def main(args: tuple) -> Path:
print('[+]==================== Single File =====================') print('[+]==================== Single File =====================')
if custom_number == '': if custom_number == '':
create_data_and_move_with_custom_number(single_file_path, create_data_and_move_with_custom_number(single_file_path,
get_number(conf.debug(), os.path.basename(single_file_path)), oCC) get_number(conf.debug(), os.path.basename(single_file_path)), oCC,
specified_source, specified_url)
else: else:
create_data_and_move_with_custom_number(single_file_path, custom_number, oCC) create_data_and_move_with_custom_number(single_file_path, custom_number, oCC,
specified_source, specified_url)
else: else:
folder_path = conf.source_folder() folder_path = conf.source_folder()
if not isinstance(folder_path, str) or folder_path == '': if not isinstance(folder_path, str) or folder_path == '':

View File

@@ -404,8 +404,8 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f
except: except:
pass pass
try: try:
f_rating = json_data['用户评分'] f_rating = json_data.get('userrating')
uc = json_data['评分人数'] uc = json_data.get('uservotes')
print(f""" <rating>{round(f_rating * 2.0, 1)}</rating> print(f""" <rating>{round(f_rating * 2.0, 1)}</rating>
<criticrating>{round(f_rating * 20.0, 1)}</criticrating> <criticrating>{round(f_rating * 20.0, 1)}</criticrating>
<ratings> <ratings>
@@ -760,7 +760,7 @@ def core_main_no_net_op(movie_path, number):
linkImage(path, number, part, leak_word, c_word, hack_word, ext) linkImage(path, number, part, leak_word, c_word, hack_word, ext)
def core_main(movie_path, number_th, oCC): def core_main(movie_path, number_th, oCC, specified_source=None, specified_url=None):
conf = config.getInstance() conf = config.getInstance()
# =======================================================================初始化所需变量 # =======================================================================初始化所需变量
multi_part = 0 multi_part = 0
@@ -775,7 +775,7 @@ def core_main(movie_path, number_th, oCC):
# 下面被注释的变量不需要 # 下面被注释的变量不需要
#rootpath= os.getcwd #rootpath= os.getcwd
number = number_th number = number_th
json_data = get_data_from_json(number, oCC) # 定义番号 json_data = get_data_from_json(number, oCC, specified_source, specified_url) # 定义番号
# Return if blank dict returned (data not found) # Return if blank dict returned (data not found)
if not json_data: if not json_data:

View File

@@ -7,7 +7,7 @@ from pathlib import Path
from ADC_function import delete_all_elements_in_list, delete_all_elements_in_str, file_modification_days, load_cookies, translate from ADC_function import delete_all_elements_in_list, delete_all_elements_in_str, file_modification_days, load_cookies, translate
from scrapinglib.api import search from scrapinglib.api import search
def get_data_from_json(file_number, oCC): def get_data_from_json(file_number, oCC, specified_source, specified_url):
""" """
iterate through all services and fetch the data 从JSON返回元数据 iterate through all services and fetch the data 从JSON返回元数据
""" """
@@ -51,9 +51,11 @@ def get_data_from_json(file_number, oCC):
cacert =None cacert =None
if conf.cacert_file(): if conf.cacert_file():
cacert = conf.cacert_file() cacert = conf.cacert_file()
json_data = search(file_number, sources, proxies=proxies, verify=cacert, json_data = search(file_number, sources, proxies=proxies, verify=cacert,
dbsite=javdb_site, dbcookies=javdb_cookies, dbsite=javdb_site, dbcookies=javdb_cookies,
morestoryline=conf.is_storyline()) morestoryline=conf.is_storyline(),
specifiedSource=specified_source, specifiedUrl=specified_url)
# Return if data not found in all sources # Return if data not found in all sources
if not json_data: if not json_data:
print('[-]Movie Number not found!') print('[-]Movie Number not found!')

View File

@@ -1,3 +1,3 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from .api import search from .api import search, getSupportedSources

View File

@@ -8,6 +8,9 @@ from .javbus import Javbus
class Airav(Parser): class Airav(Parser):
source = 'airav' source = 'airav'
# for javbus
specifiedSource = None
addtion_Javbus = True
expr_title = '/html/head/title/text()' expr_title = '/html/head/title/text()'
expr_number = '/html/head/title/text()' expr_number = '/html/head/title/text()'
@@ -21,23 +24,38 @@ class Airav(Parser):
def search(self, number): def search(self, number):
self.number = number self.number = number
self.detailurl = 'https://cn.airav.wiki/video/' + number if self.specifiedUrl:
engine = Javbus() self.detailurl = self.specifiedUrl
javbusinfo = engine.scrape(number, self)
if javbusinfo == 404:
self.javbus = {"title": ""}
else: else:
self.javbus = json.loads(javbusinfo) self.detailurl = self.queryNumberUrl(self.number)
if self.addtion_Javbus:
engine = Javbus()
javbusinfo = engine.scrape(self.number, self)
if javbusinfo == 404:
self.javbus = {"title": ""}
else:
self.javbus = json.loads(javbusinfo)
self.htmlcode = self.getHtml(self.detailurl) self.htmlcode = self.getHtml(self.detailurl)
htmltree = etree.fromstring(self.htmlcode, etree.HTMLParser()) htmltree = etree.fromstring(self.htmlcode, etree.HTMLParser())
result = self.dictformat(htmltree) result = self.dictformat(htmltree)
return result return result
def queryNumberUrl(self, number):
queryUrl = "https://cn.airav.wiki/?search=" + number
queryTree = self.getHtmlTree(queryUrl)
results = self.getTreeAll(queryTree, '//div[contains(@class,"videoList")]/div/a')
for i in results:
num = self.getTreeElement(i, '//div/div[contains(@class,"videoNumber")]/p[1]/text()')
if num.replace('-','') == number.replace('-','').upper():
self.number = num
return "https://cn.airav.wiki" + i.attrib['href']
return 'https://cn.airav.wiki/video/' + number
def getNum(self, htmltree): def getNum(self, htmltree):
# return super().getNum(htmltree) if self.addtion_Javbus:
result = self.javbus.get('number') result = self.javbus.get('number')
if isinstance(result, str) and len(result): if isinstance(result, str) and len(result):
return result return result
number = super().getNum(htmltree) number = super().getNum(htmltree)
result = str(re.findall('^\[(.*?)]', number)[0]) result = str(re.findall('^\[(.*?)]', number)[0])
return result return result
@@ -48,24 +66,27 @@ class Airav(Parser):
return result return result
def getStudio(self, htmltree): def getStudio(self, htmltree):
result = self.javbus.get('studio') if self.addtion_Javbus:
if isinstance(result, str) and len(result): result = self.javbus.get('studio')
return result if isinstance(result, str) and len(result):
return result
return super().getStudio(htmltree) return super().getStudio(htmltree)
def getRelease(self, htmltree): def getRelease(self, htmltree):
result = self.javbus.get('release') if self.addtion_Javbus:
if isinstance(result, str) and len(result): result = self.javbus.get('release')
return result if isinstance(result, str) and len(result):
return result
try: try:
return re.search(r'\d{4}-\d{2}-\d{2}', str(super().getRelease(htmltree))).group() return re.search(r'\d{4}-\d{2}-\d{2}', str(super().getRelease(htmltree))).group()
except: except:
return '' return ''
def getYear(self, htmltree): def getYear(self, htmltree):
result = self.javbus.get('year') if self.addtion_Javbus:
if isinstance(result, str) and len(result): result = self.javbus.get('year')
return result if isinstance(result, str) and len(result):
return result
release = self.getRelease(htmltree) release = self.getRelease(htmltree)
return str(re.findall('\d{4}', release)).strip(" ['']") return str(re.findall('\d{4}', release)).strip(" ['']")
@@ -73,39 +94,40 @@ class Airav(Parser):
return self.getTreeAll(htmltree, self.expr_outline).replace('\n','').strip() return self.getTreeAll(htmltree, self.expr_outline).replace('\n','').strip()
def getRuntime(self, htmltree): def getRuntime(self, htmltree):
result = self.javbus.get('runtime') if self.addtion_Javbus:
if isinstance(result, str) and len(result): result = self.javbus.get('runtime')
return result if isinstance(result, str) and len(result):
return result
return '' return ''
def getDirector(self, htmltree): def getDirector(self, htmltree):
result = self.javbus.get('director') if self.addtion_Javbus:
if isinstance(result, str) and len(result): result = self.javbus.get('director')
return result if isinstance(result, str) and len(result):
return result
return '' return ''
def getActors(self, htmltree): def getActors(self, htmltree):
b=[]
a = super().getActors(htmltree) a = super().getActors(htmltree)
for v in a: b = [ i.strip() for i in a if len(i)]
v = v.strip()
if len(v):
b.append(v)
if len(b): if len(b):
return b return b
result = self.javbus.get('actor') if self.addtion_Javbus:
if isinstance(result, list) and len(result): result = self.javbus.get('actor')
return result if isinstance(result, list) and len(result):
return result
return [] return []
def getCover(self, htmltree): def getCover(self, htmltree):
result = self.javbus.get('cover') if self.addtion_Javbus:
if isinstance(result, str) and len(result): result = self.javbus.get('cover')
return result if isinstance(result, str) and len(result):
return result
return super().getCover(htmltree) return super().getCover(htmltree)
def getSeries(self, htmltree): def getSeries(self, htmltree):
result = self.javbus.get('series') if self.addtion_Javbus:
if isinstance(result, str) and len(result): result = self.javbus.get('series')
return result if isinstance(result, str) and len(result):
return result
return '' return ''

View File

@@ -18,29 +18,45 @@ from .mgstage import Mgstage
from .javbus import Javbus from .javbus import Javbus
from .xcity import Xcity from .xcity import Xcity
from .avsox import Avsox from .avsox import Avsox
from .javlibrary import Javlibrary
from .tmdb import Tmdb from .tmdb import Tmdb
from .imdb import Imdb
def search(number, sources: str=None, proxies=None, verify=None, type='adult', def search(number, sources: str=None, proxies=None, verify=None, type='adult',
specifiedSource=None, specifiedUrl=None,
dbcookies=None, dbsite=None, morestoryline=False): dbcookies=None, dbsite=None, morestoryline=False):
""" 根据``番号/电影``名搜索信息 """ 根据`番号/电影`名搜索信息
:param number: number/name depends on type :param number: number/name depends on type
:param sources: sources string with `,` like ``avsox,javbus`` :param sources: sources string with `,` Eg: `avsox,javbus`
:param type: ``adult``, ``general`` :param type: `adult`, `general`
""" """
sc = Scraping() sc = Scraping()
return sc.search(number, sources, proxies=proxies, verify=verify, type=type, return sc.search(number, sources, proxies=proxies, verify=verify, type=type,
specifiedSource=specifiedSource, specifiedUrl=specifiedUrl,
dbcookies=dbcookies, dbsite=dbsite, morestoryline=morestoryline) dbcookies=dbcookies, dbsite=dbsite, morestoryline=morestoryline)
def getSupportedSources(tag='adult'):
"""
:param tag: `adult`, `general`
"""
sc = Scraping()
if tag == 'adult':
return ','.join(sc.adult_full_sources)
else:
return ','.join(sc.general_full_sources)
class Scraping(): class Scraping():
""" """
""" """
adult_full_sources = ['javlibrary', 'javdb', 'javbus', 'airav', 'fanza', 'xcity', 'jav321',
adult_full_sources = ['avsox', 'javbus', 'xcity', 'mgstage', 'madou', 'fc2', 'mgstage', 'fc2', 'avsox', 'dlsite', 'carib', 'madou', 'mv91',
'dlsite', 'jav321', 'fanza', 'airav', 'carib', 'mv91', 'getchu', 'gcolle'
'gcolle', 'javdb', 'getchu'] ]
adult_func_mapping = { adult_func_mapping = {
'avsox': Avsox().scrape, 'avsox': Avsox().scrape,
'javbus': Javbus().scrape, 'javbus': Javbus().scrape,
@@ -57,15 +73,19 @@ class Scraping():
'gcolle': Gcolle().scrape, 'gcolle': Gcolle().scrape,
'javdb': Javdb().scrape, 'javdb': Javdb().scrape,
'getchu': Getchu().scrape, 'getchu': Getchu().scrape,
'javlibrary': Javlibrary().scrape,
} }
general_full_sources = ['tmdb'] general_full_sources = ['tmdb','imdb']
general_func_mapping = { general_func_mapping = {
'tmdb': Tmdb().scrape, 'tmdb': Tmdb().scrape,
'imdb': Imdb().scrape,
} }
proxies = None proxies = None
verify = None verify = None
specifiedSource = None
specifiedUrl = None
dbcookies = None dbcookies = None
dbsite = None dbsite = None
@@ -73,9 +93,12 @@ class Scraping():
morestoryline = False morestoryline = False
def search(self, number, sources=None, proxies=None, verify=None, type='adult', def search(self, number, sources=None, proxies=None, verify=None, type='adult',
specifiedSource=None, specifiedUrl=None,
dbcookies=None, dbsite=None, morestoryline=False): dbcookies=None, dbsite=None, morestoryline=False):
self.proxies = proxies self.proxies = proxies
self.verify = verify self.verify = verify
self.specifiedSource = specifiedSource
self.specifiedUrl = specifiedUrl
self.dbcookies = dbcookies self.dbcookies = dbcookies
self.dbsite = dbsite self.dbsite = dbsite
self.morestoryline = morestoryline self.morestoryline = morestoryline
@@ -88,7 +111,10 @@ class Scraping():
""" 查询电影电视剧 """ 查询电影电视剧
imdb,tmdb imdb,tmdb
""" """
sources = self.checkGeneralSources(sources, name) if self.specifiedSource:
sources = [self.specifiedSource]
else:
sources = self.checkGeneralSources(sources, name)
json_data = {} json_data = {}
for source in sources: for source in sources:
try: try:
@@ -116,7 +142,10 @@ class Scraping():
return json_data return json_data
def searchAdult(self, number, sources): def searchAdult(self, number, sources):
sources = self.checkAdultSources(sources, number) if self.specifiedSource:
sources = [self.specifiedSource]
else:
sources = self.checkAdultSources(sources, number)
json_data = {} json_data = {}
for source in sources: for source in sources:
try: try:

View File

@@ -50,16 +50,20 @@ class Avsox(Parser):
def getSmallCover(self, htmltree): def getSmallCover(self, htmltree):
""" 使用搜索页面的预览小图 """ 使用搜索页面的预览小图
""" """
return self.getTreeElement(self.searchtree, self.expr_smallcover) try:
return self.getTreeElement(self.searchtree, self.expr_smallcover)
except:
self.imagecut = 1
return ''
def getTags(self, htmltree): def getTags(self, htmltree):
tags = self.getTreeElement(htmltree).split(',') tags = self.getTreeElement(htmltree, self.expr_tags).split(',')
return [i.strip() for i in tags[2:]] if len(tags) > 2 else [] return [i.strip() for i in tags[2:]] if len(tags) > 2 else []
def getOutline(self, htmltree): def getOutline(self, htmltree):
if self.morestoryline: if self.morestoryline:
from .storyline import getStoryline from .storyline import getStoryline
return getStoryline(self.number) return getStoryline(self.number, proxies=self.proxies, verify=self.verify)
return '' return ''
def getActors(self, htmltree): def getActors(self, htmltree):

View File

@@ -22,7 +22,10 @@ class Carib(Parser):
def search(self, number): def search(self, number):
self.number = number self.number = number
self.detailurl = f'https://www.caribbeancom.com/moviepages/{number}/index.html' if self.specifiedUrl:
self.detailurl = self.specifiedUrl
else:
self.detailurl = f'https://www.caribbeancom.com/moviepages/{number}/index.html'
htmlcode = self.getHtml(self.detailurl) htmlcode = self.getHtml(self.detailurl)
if htmlcode == 404 or 'class="movie-info section"' not in htmlcode: if htmlcode == 404 or 'class="movie-info section"' not in htmlcode:
return 404 return 404
@@ -87,9 +90,11 @@ class Carib(Parser):
return o return o
def getOutline(self, htmltree): def getOutline(self, htmltree):
from .storyline import getStoryline if self.morestoryline:
result = getStoryline(self.number, uncensored=self.uncensored) from .storyline import getStoryline
if len(result): result = getStoryline(self.number, uncensored=self.uncensored,
return result proxies=self.proxies, verify=self.verify)
if len(result):
return result
return super().getOutline(htmltree) return super().getOutline(htmltree)

View File

@@ -29,7 +29,12 @@ class Dlsite(Parser):
def search(self, number): def search(self, number):
self.cookies = {'locale': 'zh-cn'} self.cookies = {'locale': 'zh-cn'}
if "RJ" in number or "VJ" in number: if self.specifiedUrl:
self.detailurl = self.specifiedUrl
# TODO 应该从页面内获取 number
self.number = str(re.findall("\wJ\w+", self.detailurl)).strip(" [']")
htmltree = self.getHtmlTree(self.detailurl)
elif "RJ" in number or "VJ" in number:
self.number = number.upper() self.number = number.upper()
self.detailurl = 'https://www.dlsite.com/maniax/work/=/product_id/' + self.number + '.html/?locale=zh_CN' self.detailurl = 'https://www.dlsite.com/maniax/work/=/product_id/' + self.number + '.html/?locale=zh_CN'
htmltree = self.getHtmlTree(self.detailurl) htmltree = self.getHtmlTree(self.detailurl)

View File

@@ -11,15 +11,21 @@ class Fanza(Parser):
expr_title = '//*[starts-with(@id, "title")]/text()' expr_title = '//*[starts-with(@id, "title")]/text()'
expr_actor = "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()" expr_actor = "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()"
expr_cover = '//head/meta[@property="og:image"]' expr_cover = './/head/meta[@property="og:image"]/@content'
expr_extrafanart = '//a[@name="sample-image"]/img/@src' expr_extrafanart = '//a[@name="sample-image"]/img/@src'
expr_outline = "//div[@class='mg-b20 lh4']/text()" expr_outline = "//div[@class='mg-b20 lh4']/text()"
expr_outline2 = "//div[@class='mg-b20 lh4']//p/text()" expr_outline2 = "//div[@class='mg-b20 lh4']//p/text()"
expr_outline_og = '//head/meta[@property="og:description"]' expr_outline_og = '//head/meta[@property="og:description"]/@content'
expr_runtime = "//td[contains(text(),'収録時間')]/following-sibling::td/text()" expr_runtime = "//td[contains(text(),'収録時間')]/following-sibling::td/text()"
def search(self, number): def search(self, number):
self.number = number self.number = number
if self.specifiedUrl:
self.detailurl = self.specifiedUrl
durl = "https://www.dmm.co.jp/age_check/=/declared=yes/?"+ urlencode({"rurl": self.detailurl})
self.htmltree = self.getHtmlTree(durl)
result = self.dictformat(self.htmltree)
return result
# fanza allow letter + number + underscore, normalize the input here # fanza allow letter + number + underscore, normalize the input here
# @note: I only find the usage of underscore as h_test123456789 # @note: I only find the usage of underscore as h_test123456789
fanza_search_number = number fanza_search_number = number
@@ -75,7 +81,7 @@ class Fanza(Parser):
if result == '': if result == '':
result = self.getTreeElement(htmltree, self.expr_outline2).replace("\n", "") result = self.getTreeElement(htmltree, self.expr_outline2).replace("\n", "")
if "※ 配信方法によって収録内容が異なる場合があります。" == result: if "※ 配信方法によって収録内容が異なる場合があります。" == result:
result = self.getTreeElement(htmltree, self.expr_outline_og).get('content') result = self.getTreeElement(htmltree, self.expr_outline_og)
return result return result
except: except:
return '' return ''
@@ -99,9 +105,6 @@ class Fanza(Parser):
result = self.getFanzaString('配信開始日:') result = self.getFanzaString('配信開始日:')
return result.replace("/", "-").strip('\\n') return result.replace("/", "-").strip('\\n')
def getCover(self, htmltree):
return self.getTreeElement(htmltree, './/head/meta[@property="og:image"]').get('content')
def getTags(self, htmltree): def getTags(self, htmltree):
return self.getFanzaStrings('ジャンル:') return self.getFanzaStrings('ジャンル:')

View File

@@ -22,8 +22,11 @@ class Fc2(Parser):
expr_tags = "//a[@class='tag tagTag']/text()" expr_tags = "//a[@class='tag tagTag']/text()"
def search(self, number): def search(self, number):
self.number = number.replace('FC2-', '').replace('fc2-', '') self.number = number.lower().replace('fc2-ppv-', '').replace('fc2-', '')
self.detailurl = 'https://adult.contents.fc2.com/article/' + self.number + '/' if self.specifiedUrl:
self.detailurl = self.specifiedUrl
else:
self.detailurl = 'https://adult.contents.fc2.com/article/' + self.number + '/'
self.htmlcode = self.getHtml(self.detailurl) self.htmlcode = self.getHtml(self.detailurl)
if self.htmlcode == 404: if self.htmlcode == 404:
return 404 return 404

View File

@@ -2,7 +2,7 @@
import re import re
from lxml import etree from lxml import etree
from .httprequest import get_html_session from .httprequest import request_session
from .parser import Parser from .parser import Parser
@@ -27,9 +27,12 @@ class Gcolle(Parser):
def search(self, number): def search(self, number):
self.number = number.upper().replace('GCOLLE-','') self.number = number.upper().replace('GCOLLE-','')
self.detailurl = 'https://gcolle.net/product_info.php/products_id/' + self.number if self.specifiedUrl:
session = get_html_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify) self.detailurl = self.specifiedUrl
htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + self.number).text else:
self.detailurl = 'https://gcolle.net/product_info.php/products_id/' + self.number
session = request_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify)
htmlcode = session.get(self.detailurl).text
htmltree = etree.HTML(htmlcode) htmltree = etree.HTML(htmlcode)
r18url = self.getTreeElement(htmltree, self.expr_r18) r18url = self.getTreeElement(htmltree, self.expr_r18)

View File

@@ -35,7 +35,7 @@ class wwwGetchu(Parser):
GETCHU_WWW_SEARCH_URL = 'http://www.getchu.com/php/search.phtml?genre=anime_dvd&search_keyword=_WORD_&check_key_dtl=1&submit=' GETCHU_WWW_SEARCH_URL = 'http://www.getchu.com/php/search.phtml?genre=anime_dvd&search_keyword=_WORD_&check_key_dtl=1&submit='
expr_title = '//*[@id="soft-title"]/text()' expr_title = '//*[@id="soft-title"]/text()'
expr_cover = '//head/meta[@property="og:image"]' expr_cover = '//head/meta[@property="og:image"]/@content'
expr_director = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()" expr_director = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"
expr_studio = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()" expr_studio = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"
expr_actor = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()" expr_actor = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"
@@ -67,9 +67,6 @@ class wwwGetchu(Parser):
def getNum(self, htmltree): def getNum(self, htmltree):
return 'GETCHU-' + re.findall('\d+', self.detailurl.replace("http://www.getchu.com/soft.phtml?id=", ""))[0] return 'GETCHU-' + re.findall('\d+', self.detailurl.replace("http://www.getchu.com/soft.phtml?id=", ""))[0]
def getCover(self, htmltree):
return self.getTreeElement(htmltree, self.expr_cover).get('content')
def getActors(self, htmltree): def getActors(self, htmltree):
return super().getDirector(htmltree) return super().getDirector(htmltree)

View File

@@ -9,8 +9,9 @@ from cloudscraper import create_scraper
G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.133 Safari/537.36' G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.133 Safari/537.36'
G_DEFAULT_TIMEOUT = 10 G_DEFAULT_TIMEOUT = 10
def get(url: str, cookies=None, ua: str = None, extra_headers=None, return_type: str = None, encoding: str = None,
retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None): def get(url: str, cookies=None, ua: str=None, extra_headers=None, return_type: str=None, encoding: str=None,
retry: int=3, timeout: int=G_DEFAULT_TIMEOUT, proxies=None, verify=None):
""" """
网页请求核心函数 网页请求核心函数
@@ -43,8 +44,8 @@ def get(url: str, cookies=None, ua: str = None, extra_headers=None, return_type:
raise Exception('Connect Failed') raise Exception('Connect Failed')
def post(url: str, data: dict, files=None, cookies=None, ua: str = None, return_type: str = None, encoding: str = None, def post(url: str, data: dict=None, files=None, cookies=None, ua: str=None, return_type: str=None, encoding: str=None,
retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None): retry: int=3, timeout: int=G_DEFAULT_TIMEOUT, proxies=None, verify=None):
""" """
是否使用代理应由上层处理 是否使用代理应由上层处理
""" """
@@ -74,11 +75,6 @@ def post(url: str, data: dict, files=None, cookies=None, ua: str = None, return_
raise Exception('Connect Failed') raise Exception('Connect Failed')
#
# TODO: 以下临时使用,更新完各站后,再更新
#
class TimeoutHTTPAdapter(HTTPAdapter): class TimeoutHTTPAdapter(HTTPAdapter):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
self.timeout = G_DEFAULT_TIMEOUT self.timeout = G_DEFAULT_TIMEOUT
@@ -94,10 +90,10 @@ class TimeoutHTTPAdapter(HTTPAdapter):
return super().send(request, **kwargs) return super().send(request, **kwargs)
# with keep-alive feature def request_session(cookies=None, ua: str=None, retry: int=3, timeout: int=G_DEFAULT_TIMEOUT, proxies=None, verify=None):
# storyline carib gcolle javdb only """
def get_html_session(url: str = None, cookies = None, ua: str = None, return_type: str = None, keep-alive
encoding: str = None, retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None): """
session = requests.Session() session = requests.Session()
retries = Retry(total=retry, connect=retry, backoff_factor=1, retries = Retry(total=retry, connect=retry, backoff_factor=1,
status_forcelist=[429, 500, 502, 503, 504]) status_forcelist=[429, 500, 502, 503, 504])
@@ -110,67 +106,8 @@ def get_html_session(url: str = None, cookies = None, ua: str = None, return_typ
if proxies: if proxies:
session.proxies = proxies session.proxies = proxies
session.headers = {"User-Agent": ua or G_USER_AGENT} session.headers = {"User-Agent": ua or G_USER_AGENT}
try: return session
if isinstance(url, str) and len(url):
result = session.get(str(url))
else: # 空url参数直接返回可重用session对象无需设置return_type
return session
if not result.ok:
return None
if return_type == "object":
return result
elif return_type == "content":
return result.content
elif return_type == "session":
return result, session
else:
result.encoding = encoding or "utf-8"
return result.text
except requests.exceptions.ProxyError:
print("[-]get_html_session() Proxy error! Please check your Proxy")
except Exception as e:
print(f"[-]get_html_session() failed. {e}")
return None
# storyline only
# 使用 cloudscraper....
def get_html_by_browser(url: str = None, cookies: dict = None, ua: str = None, return_type: str = None,
encoding: str = None, use_scraper: bool = False,
retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
session = create_scraper(browser={'custom': ua or G_USER_AGENT, }) if use_scraper else requests.Session()
if isinstance(cookies, dict) and len(cookies):
requests.utils.add_dict_to_cookiejar(session.cookies, cookies)
retries = Retry(total=retry, connect=retry, backoff_factor=1,
status_forcelist=[429, 500, 502, 503, 504])
session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
if verify:
session.verify = verify
if proxies:
session.proxies = proxies
try:
browser = mechanicalsoup.StatefulBrowser(user_agent=ua or G_USER_AGENT, session=session)
if isinstance(url, str) and len(url):
result = browser.open(url)
else:
return browser
if not result.ok:
return None
if return_type == "object":
return result
elif return_type == "content":
return result.content
elif return_type == "browser":
return result, browser
else:
result.encoding = encoding or "utf-8"
return result.text
except requests.exceptions.ProxyError:
print("[-]get_html_by_browser() Proxy error! Please check your Proxy")
except Exception as e:
print(f'[-]get_html_by_browser() Failed! {e}')
return None
# storyline xcity only # storyline xcity only
def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None, def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None,

24
scrapinglib/imdb.py Normal file
View File

@@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-
from .parser import Parser
class Imdb(Parser):
source = 'imdb'
imagecut = 0
expr_title = '//h1[@data-testid="hero-title-block__title"]/text()'
expr_release = '//a[contains(text(),"Release date")]/following-sibling::div[1]/ul/li/a/text()'
expr_cover = '//head/meta[@property="og:image"]/@content'
expr_outline = '//head/meta[@property="og:description"]/@content'
expr_actor = '//h3[contains(text(),"Top cast")]/../../../following-sibling::div[1]/div[2]/div/div/a/text()'
expr_tags = '//div[@data-testid="genres"]/div[2]/a/ul/li/text()'
def queryNumberUrl(self, number):
"""
TODO 区分 ID 与 名称
"""
id = number
movieUrl = "https://www.imdb.com/title/" + id
return movieUrl

View File

@@ -26,6 +26,14 @@ class Jav321(Parser):
return 'https://www.jav321.com/search' return 'https://www.jav321.com/search'
def getHtmlTree(self, url): def getHtmlTree(self, url):
"""
特殊处理 仅获取页面调用一次
"""
if self.specifiedUrl:
self.detailurl = self.specifiedUrl
resp = httprequest.get(self.detailurl, cookies=self.cookies, proxies=self.proxies, verify=self.verify)
self.detailhtml = resp
return etree.fromstring(resp, etree.HTMLParser())
resp = httprequest.post(url, data={"sn": self.number}, cookies=self.cookies, proxies=self.proxies, verify=self.verify) resp = httprequest.post(url, data={"sn": self.number}, cookies=self.cookies, proxies=self.proxies, verify=self.verify)
if "/video/" in resp.url: if "/video/" in resp.url:
self.detailurl = resp.url self.detailurl = resp.url

View File

@@ -33,6 +33,11 @@ class Javbus(Parser):
def search(self, number): def search(self, number):
self.number = number self.number = number
try: try:
if self.specifiedUrl:
self.detailurl = self.specifiedUrl
htmltree = self.getHtmlTree(self.detailurl)
result = self.dictformat(htmltree)
return result
url = "https://www." + secrets.choice([ url = "https://www." + secrets.choice([
'buscdn.fun', 'busdmm.fun', 'busfan.fun', 'busjav.fun', 'buscdn.fun', 'busdmm.fun', 'busfan.fun', 'busjav.fun',
'cdnbus.fun', 'cdnbus.fun',
@@ -61,7 +66,10 @@ class Javbus(Parser):
self.uncensored = True self.uncensored = True
w_number = number.replace('.', '-') w_number = number.replace('.', '-')
self.detailurl = 'https://www.javbus.red/' + w_number if self.specifiedUrl:
self.detailurl = self.specifiedUrl
else:
self.detailurl = 'https://www.javbus.red/' + w_number
self.htmlcode = self.getHtml(self.detailurl) self.htmlcode = self.getHtml(self.detailurl)
if self.htmlcode == 404: if self.htmlcode == 404:
return 404 return 404
@@ -128,5 +136,6 @@ class Javbus(Parser):
if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'): if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'):
return '' # 从airav.py过来的调用不计算outline直接返回避免重复抓取数据拖慢处理速度 return '' # 从airav.py过来的调用不计算outline直接返回避免重复抓取数据拖慢处理速度
from .storyline import getStoryline from .storyline import getStoryline
return getStoryline(self.number , uncensored = self.uncensored) return getStoryline(self.number , uncensored = self.uncensored,
proxies=self.proxies, verify=self.verify)
return '' return ''

View File

@@ -4,7 +4,7 @@
import re import re
from urllib.parse import urljoin from urllib.parse import urljoin
from lxml import etree from lxml import etree
from .httprequest import get_html_session from .httprequest import request_session
from .parser import Parser from .parser import Parser
@@ -63,8 +63,11 @@ class Javdb(Parser):
def search(self, number: str): def search(self, number: str):
self.number = number self.number = number
self.session = get_html_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify) self.session = request_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify)
self.detailurl = self.queryNumberUrl(number) if self.specifiedUrl:
self.detailurl = self.specifiedUrl
else:
self.detailurl = self.queryNumberUrl(number)
self.deatilpage = self.session.get(self.detailurl).text self.deatilpage = self.session.get(self.detailurl).text
if '此內容需要登入才能查看或操作' in self.deatilpage or '需要VIP權限才能訪問此內容' in self.deatilpage: if '此內容需要登入才能查看或操作' in self.deatilpage or '需要VIP權限才能訪問此內容' in self.deatilpage:
self.noauth = True self.noauth = True
@@ -173,7 +176,8 @@ class Javdb(Parser):
def getOutline(self, htmltree): def getOutline(self, htmltree):
if self.morestoryline: if self.morestoryline:
from .storyline import getStoryline from .storyline import getStoryline
return getStoryline(self.number, self.getUncensored(htmltree)) return getStoryline(self.number, self.getUncensored(htmltree),
proxies=self.proxies, verify=self.verify)
return '' return ''
def getTrailer(self, htmltree): def getTrailer(self, htmltree):
@@ -193,19 +197,19 @@ class Javdb(Parser):
def getUserRating(self, htmltree): def getUserRating(self, htmltree):
try: try:
result = str(self.getTreeElement(htmltree, self.expr_userrating)) numstrs = self.getTreeElement(htmltree, self.expr_userrating)
v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', result) nums = re.findall('[0-9.]+', numstrs)
return float(v[0][0]) return float(nums[0])
except: except:
return return ''
def getUserVotes(self, htmltree): def getUserVotes(self, htmltree):
try: try:
result = str(self.getTreeElement(htmltree, self.expr_uservotes)) result = self.getTreeElement(htmltree, self.expr_uservotes)
v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', result) v = re.findall('[0-9.]+', result)
return int(v[0][1]) return int(v[1])
except: except:
return return ''
def getaphoto(self, url, session): def getaphoto(self, url, session):
html_page = session.get(url).text html_page = session.get(url).text

81
scrapinglib/javlibrary.py Normal file
View File

@@ -0,0 +1,81 @@
# -*- coding: utf-8 -*-
from lxml import etree
from .httprequest import request_session
from .parser import Parser
class Javlibrary(Parser):
    """Scraper for javlibrary.com detail pages."""

    source = 'javlibrary'
    # Detail-page tree cache: filled either by the search redirect in
    # queryNumberUrl() or by fetching the detail URL in search().
    htmltree = None

    expr_number = '//div[@id="video_id"]/table/tr/td[@class="text"]/text()'
    expr_title = '//div[@id="video_title"]/h3/a/text()'
    expr_actor = '//div[@id="video_cast"]/table/tr/td[@class="text"]/span/span[@class="star"]/a/text()'
    expr_tags = '//div[@id="video_genres"]/table/tr/td[@class="text"]/span/a/text()'
    expr_cover = '//img[@id="video_jacket_img"]/@src'
    expr_release = '//div[@id="video_date"]/table/tr/td[@class="text"]/text()'
    expr_studio = '//div[@id="video_maker"]/table/tr/td[@class="text"]/span/a/text()'
    expr_runtime = '//div[@id="video_length"]/table/tr/td/span[@class="text"]/text()'
    expr_userrating = '//div[@id="video_review"]/table/tr/td/span[@class="score"]/text()'
    expr_director = '//div[@id="video_director"]/table/tr/td[@class="text"]/span/a/text()'
    expr_extrafanart = '//div[@class="previewthumbs"]/img/@src'

    def updateCore(self, core):
        """Copy proxy / TLS-verify / storyline settings from the core config."""
        if core.proxies:
            self.proxies = core.proxies
        if core.verify:
            self.verify = core.verify
        if core.morestoryline:
            self.morestoryline = True
        # javlibrary gates content behind an age-confirmation cookie.
        self.cookies = {'over18':'1'}

    def search(self, number):
        """Look up *number* (case-insensitive) and return the metadata result."""
        self.number = number.upper()
        self.session = request_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify)
        self.detailurl = self.specifiedUrl if self.specifiedUrl else self.queryNumberUrl(self.number)
        if not self.detailurl:
            return 404
        if self.htmltree is None:
            # Not pre-cached by queryNumberUrl(); fetch the detail page now.
            page = self.session.get(self.detailurl)
            self.htmltree = etree.fromstring(page.text, etree.HTMLParser())
        return self.dictformat(self.htmltree)

    def queryNumberUrl(self, number:str):
        """Resolve *number* to a detail-page URL via the site's id search.

        A unique hit redirects straight to the detail page (whose tree is
        cached on the instance); otherwise the result list is scanned for
        an exact id match.  Returns ``None`` when nothing matches.
        """
        queryUrl = "http://www.javlibrary.com/cn/vl_searchbyid.php?keyword=" + number
        resp = self.session.get(queryUrl)
        if resp and "/?v=jav" in resp.url:
            # Redirected directly to the detail page — cache its tree.
            self.htmltree = etree.fromstring(resp.text, etree.HTMLParser())
            return resp.url
        listtree = etree.fromstring(resp.text, etree.HTMLParser())
        ids = listtree.xpath('//div[@class="id"]/text()')
        if number not in ids:
            return None
        links = listtree.xpath('//div[@class="id"]/../@href')
        # Relative links look like "./?v=jav..."; strip the leading dot.
        return "http://www.javlibrary.com/cn" + links[ids.index(number)].strip('.')

    def getTitle(self, htmltree):
        # The page title embeds the id; drop it so only the name remains.
        raw = super().getTitle(htmltree)
        return raw.replace(self.getNum(htmltree), '').strip()

    def getCover(self, htmltree):
        # Jacket images are protocol-relative ("//pics..."); make absolute.
        url = super().getCover(htmltree)
        return url if url.startswith('http') else 'https:' + url

    def getOutline(self, htmltree):
        if not self.morestoryline:
            return ''
        from .storyline import getStoryline
        return getStoryline(self.number, self.getUncensored(htmltree),
                            proxies=self.proxies, verify=self.verify)

View File

@@ -8,6 +8,7 @@ from .parser import Parser
class Madou(Parser): class Madou(Parser):
source = 'madou' source = 'madou'
imagecut = 0
uncensored = True uncensored = True
expr_url = '//a[@class="share-weixin"]/@data-url' expr_url = '//a[@class="share-weixin"]/@data-url'
@@ -17,7 +18,10 @@ class Madou(Parser):
def search(self, number): def search(self, number):
self.number = number.lower().strip() self.number = number.lower().strip()
self.detailurl = "https://madou.club/" + number + ".html" if self.specifiedUrl:
self.detailurl = self.specifiedUrl
else:
self.detailurl = "https://madou.club/" + number + ".html"
self.htmlcode = self.getHtml(self.detailurl) self.htmlcode = self.getHtml(self.detailurl)
if self.htmlcode == 404: if self.htmlcode == 404:
return 404 return 404
@@ -59,5 +63,5 @@ class Madou(Parser):
def getTags(self, htmltree): def getTags(self, htmltree):
studio = self.getStudio(htmltree) studio = self.getStudio(htmltree)
x = super().getTags(htmltree).split(',') x = super().getTags(htmltree)
return [i.strip() for i in x if len(i.strip()) and studio not in i and '麻豆' not in i] return [i.strip() for i in x if len(i.strip()) and studio not in i and '麻豆' not in i]

View File

@@ -25,7 +25,10 @@ class Mgstage(Parser):
def search(self, number): def search(self, number):
self.number = number.upper() self.number = number.upper()
self.cookies = {'adc':'1'} self.cookies = {'adc':'1'}
self.detailurl = 'https://www.mgstage.com/product/product_detail/'+str(self.number)+'/' if self.specifiedUrl:
self.detailurl = self.specifiedUrl
else:
self.detailurl = 'https://www.mgstage.com/product/product_detail/'+str(self.number)+'/'
htmltree =self.getHtmlTree(self.detailurl) htmltree =self.getHtmlTree(self.detailurl)
result = self.dictformat(htmltree) result = self.dictformat(htmltree)
return result return result

View File

@@ -8,6 +8,8 @@ from .parser import Parser
class Mv91(Parser): class Mv91(Parser):
source = 'mv91' source = 'mv91'
imagecut = 0
uncensored = True
expr_number = '//div[@class="player-title"]/text()' expr_number = '//div[@class="player-title"]/text()'
expr_title = '//div[@class="player-title"]/text()' expr_title = '//div[@class="player-title"]/text()'
@@ -53,8 +55,8 @@ class Mv91(Parser):
result = str(finds[0][0]) result = str(finds[0][0])
else: else:
result = ' '.join(title.replace('/',' ').split()) result = ' '.join(title.replace('/',' ').split())
result = result.split()[0].replace('「预告」','') result = result.split()[0]
return result.strip() return result.replace('「预告」','').strip('/ ')
except: except:
return '' return ''

View File

@@ -11,7 +11,10 @@ class Parser:
""" 基础刮削类 """ 基础刮削类
""" """
source = 'base' source = 'base'
# poster: `0` 复制 `1` 裁剪 # 推荐剪切poster封面:
# `0` 复制cover
# `1` 裁剪cover
# `3` 下载小封面
imagecut = 1 imagecut = 1
uncensored = False uncensored = False
allow_number_change = False allow_number_change = False
@@ -21,6 +24,7 @@ class Parser:
extraheader = None extraheader = None
cookies = None cookies = None
morestoryline = False morestoryline = False
specifiedUrl = None
number = '' number = ''
detailurl = '' detailurl = ''
@@ -61,8 +65,19 @@ class Parser:
return result return result
def search(self, number): def search(self, number):
""" 查询番号
查询主要流程:
1. 获取 url
2. 获取详情页面
3. 解析
4. 返回 result
"""
self.number = number self.number = number
self.detailurl = self.queryNumberUrl(number) if self.specifiedUrl:
self.detailurl = self.specifiedUrl
else:
self.detailurl = self.queryNumberUrl(number)
htmltree = self.getHtmlTree(self.detailurl) htmltree = self.getHtmlTree(self.detailurl)
result = self.dictformat(htmltree) result = self.dictformat(htmltree)
return result return result
@@ -73,19 +88,24 @@ class Parser:
针对需要传递的参数: cookies, proxy等 针对需要传递的参数: cookies, proxy等
子类继承后修改 子类继承后修改
""" """
if not core:
return
if core.proxies: if core.proxies:
self.proxies = core.proxies self.proxies = core.proxies
if core.verify: if core.verify:
self.verify = core.verify self.verify = core.verify
if core.morestoryline: if core.morestoryline:
self.morestoryline = True self.morestoryline = True
if core.specifiedSource == self.source:
self.specifiedUrl = core.specifiedUrl
def queryNumberUrl(self, number): def queryNumberUrl(self, number):
""" 根据番号查询详细信息url """ 根据番号查询详细信息url
需要针对不同站点修改,或者在上层直接获取
备份查询页面,预览图可能需要 备份查询页面,预览图可能需要
""" """
url = httprequest.get(number) url = "http://detailurl.ai/" + number
return url return url
def getHtml(self, url, type = None): def getHtml(self, url, type = None):
@@ -115,26 +135,26 @@ class Parser:
'number': self.getNum(htmltree), 'number': self.getNum(htmltree),
'title': self.getTitle(htmltree), 'title': self.getTitle(htmltree),
'studio': self.getStudio(htmltree), 'studio': self.getStudio(htmltree),
'release': self.getRelease(htmltree),
'year': self.getYear(htmltree), 'year': self.getYear(htmltree),
'outline': self.getOutline(htmltree), 'outline': self.getOutline(htmltree),
'runtime': self.getRuntime(htmltree), 'runtime': self.getRuntime(htmltree),
'director': self.getDirector(htmltree), 'director': self.getDirector(htmltree),
'actor': self.getActors(htmltree), 'actor': self.getActors(htmltree),
'release': self.getRelease(htmltree), 'actor_photo': self.getActorPhoto(htmltree),
'cover': self.getCover(htmltree), 'cover': self.getCover(htmltree),
'cover_small': self.getSmallCover(htmltree), 'cover_small': self.getSmallCover(htmltree),
'extrafanart': self.getExtrafanart(htmltree), 'extrafanart': self.getExtrafanart(htmltree),
'trailer': self.getTrailer(htmltree), 'trailer': self.getTrailer(htmltree),
'imagecut': self.imagecut,
'tag': self.getTags(htmltree), 'tag': self.getTags(htmltree),
'label': self.getLabel(htmltree), 'label': self.getLabel(htmltree),
'actor_photo': self.getActorPhoto(htmltree), 'series': self.getSeries(htmltree),
'userrating': self.getUserRating(htmltree),
'uservotes': self.getUserVotes(htmltree),
'uncensored': self.getUncensored(htmltree),
'website': self.detailurl, 'website': self.detailurl,
'source': self.source, 'source': self.source,
'series': self.getSeries(htmltree), 'imagecut': self.getImagecut(htmltree),
'uncensored': self.getUncensored(htmltree),
'userrating': self.getUserRating(htmltree),
'uservotes': self.getUserVotes(htmltree)
} }
dic = self.extradict(dic) dic = self.extradict(dic)
except Exception as e: except Exception as e:
@@ -215,11 +235,26 @@ class Parser:
else: else:
return self.uncensored return self.uncensored
def getImagecut(self, htmlree):
""" 修正 无码poster不裁剪cover
"""
if self.imagecut == 1 and self.getUncensored(htmlree):
self.imagecut = 0
return self.imagecut
def getUserRating(self, htmltree): def getUserRating(self, htmltree):
return self.getTreeElement(htmltree, self.expr_userrating) numstrs = self.getTreeElement(htmltree, self.expr_userrating)
nums = re.findall('[0-9.]+', numstrs)
if len(nums) == 1:
return float(nums[0])
return ''
def getUserVotes(self, htmltree): def getUserVotes(self, htmltree):
return self.getTreeElement(htmltree, self.expr_uservotes) votestrs = self.getTreeElement(htmltree, self.expr_uservotes)
votes = re.findall('[0-9]+', votestrs)
if len(votes) == 1:
return int(votes[0])
return ''
def getTreeElement(self, tree: html.HtmlElement, expr, index=0): def getTreeElement(self, tree: html.HtmlElement, expr, index=0):
""" 根据表达式从`xmltree`中获取匹配值,默认 index 为 0 """ 根据表达式从`xmltree`中获取匹配值,默认 index 为 0

View File

@@ -5,6 +5,7 @@
""" """
import json
import os import os
import re import re
import time import time
@@ -13,7 +14,10 @@ import builtins
from urllib.parse import urljoin from urllib.parse import urljoin
from lxml.html import fromstring from lxml.html import fromstring
from multiprocessing.dummy import Pool as ThreadPool from multiprocessing.dummy import Pool as ThreadPool
from .httprequest import get_html_by_browser, get_html_by_form, get_html_by_scraper, get_html_session
from scrapinglib.airav import Airav
from scrapinglib.xcity import Xcity
from .httprequest import get_html_by_form, get_html_by_scraper, request_session
# 舍弃 Amazon 源 # 舍弃 Amazon 源
G_registered_storyline_site = {"airavwiki", "airav", "avno1", "xcity", "58avgo"} G_registered_storyline_site = {"airavwiki", "airav", "avno1", "xcity", "58avgo"}
@@ -35,7 +39,7 @@ class noThread(object):
# 获取剧情介绍 从列表中的站点同时查,取值优先级从前到后 # 获取剧情介绍 从列表中的站点同时查,取值优先级从前到后
def getStoryline(number, title = None, sites: list=None, uncensored=None): def getStoryline(number, title=None, sites: list=None, uncensored=None, proxies=None, verify=None):
start_time = time.time() start_time = time.time()
debug = False debug = False
storyine_sites = "1:avno1,4:airavwiki".split(',') storyine_sites = "1:avno1,4:airavwiki".split(',')
@@ -52,7 +56,7 @@ def getStoryline(number, title = None, sites: list=None, uncensored=None):
r_dup.add(ns) r_dup.add(ns)
sort_sites.sort() sort_sites.sort()
apply_sites = [re.sub(r'.*?:', '', s, re.A) for s in sort_sites] apply_sites = [re.sub(r'.*?:', '', s, re.A) for s in sort_sites]
mp_args = ((site, number, title, debug) for site in apply_sites) mp_args = ((site, number, title, debug, proxies, verify) for site in apply_sites)
cores = min(len(apply_sites), os.cpu_count()) cores = min(len(apply_sites), os.cpu_count())
if cores == 0: if cores == 0:
return '' return ''
@@ -79,24 +83,21 @@ def getStoryline(number, title = None, sites: list=None, uncensored=None):
def getStoryline_mp(args): def getStoryline_mp(args):
(site, number, title, debug) = args (site, number, title, debug, proxies, verify) = args
start_time = time.time() start_time = time.time()
storyline = None storyline = None
if not isinstance(site, str): if not isinstance(site, str):
return storyline return storyline
elif site == "airavwiki": elif site == "airavwiki":
storyline = getStoryline_airavwiki(number, debug) storyline = getStoryline_airavwiki(number, debug, proxies, verify)
#storyline = getStoryline_airavwiki_super(number, debug)
elif site == "airav": elif site == "airav":
storyline = getStoryline_airav(number, debug) storyline = getStoryline_airav(number, debug, proxies, verify)
elif site == "avno1": elif site == "avno1":
storyline = getStoryline_avno1(number, debug) storyline = getStoryline_avno1(number, debug, proxies, verify)
elif site == "xcity": elif site == "xcity":
storyline = getStoryline_xcity(number, debug) storyline = getStoryline_xcity(number, debug, proxies, verify)
# elif site == "amazon":
# storyline = getStoryline_amazon(title, number, debug)
elif site == "58avgo": elif site == "58avgo":
storyline = getStoryline_58avgo(number, debug) storyline = getStoryline_58avgo(number, debug, proxies, verify)
if not debug: if not debug:
return storyline return storyline
print("[!]MP 线程[{}]运行{:.3f}秒,结束于{}返回结果: {}".format( print("[!]MP 线程[{}]运行{:.3f}秒,结束于{}返回结果: {}".format(
@@ -108,11 +109,12 @@ def getStoryline_mp(args):
return storyline return storyline
def getStoryline_airav(number, debug): def getStoryline_airav(number, debug, proxies, verify):
try: try:
site = secrets.choice(('airav.cc','airav4.club')) site = secrets.choice(('airav.cc','airav4.club'))
url = f'https://{site}/searchresults.aspx?Search={number}&Type=0' url = f'https://{site}/searchresults.aspx?Search={number}&Type=0'
res, session = get_html_session(url, return_type='session') session = request_session(proxies=proxies, verify=verify)
res = session.get(url)
if not res: if not res:
raise ValueError(f"get_html_by_session('{url}') failed") raise ValueError(f"get_html_by_session('{url}') failed")
lx = fromstring(res.text) lx = fromstring(res.text)
@@ -142,36 +144,16 @@ def getStoryline_airav(number, debug):
return None return None
def getStoryline_airavwiki(number, debug): def getStoryline_airavwiki(number, debug, proxies, verify):
try: try:
kwd = number[:6] if re.match(r'\d{6}[\-_]\d{2,3}', number) else number kwd = number[:6] if re.match(r'\d{6}[\-_]\d{2,3}', number) else number
url = f'https://cn.airav.wiki/?search={kwd}' airavwiki = Airav()
result, browser = get_html_by_browser(url, return_type='browser', use_scraper=True) airavwiki.addtion_Javbus = False
if not result.ok: airavwiki.proxies = proxies
raise ValueError(f"get_html_by_browser('{url}','{number}') failed") airavwiki.verify = verify
s = browser.page.select('div.row > div > div.videoList.row > div > a.d-block') jsons = airavwiki.search(kwd)
link = None outline = json.loads(jsons).get('outline')
for a in s: return outline
title = a.img['title']
list_number = re.findall('^(.*?)\s+', title, re.A)[0].strip()
if kwd == number: # 番号PRED-164 和 RED-164需要能够区分
if re.match(f'^{number}$', list_number, re.I):
link = a
break
elif re.search(number, list_number, re.I):
link = a
break
if link is None:
raise ValueError("number not found")
result = browser.follow_link(link)
if not result.ok or not re.search(number, browser.url, re.I):
raise ValueError("detail page not found")
title = browser.page.select('head > title')[0].text.strip()
detail_number = str(re.findall('\[(.*?)]', title)[0])
if not re.search(number, detail_number, re.I):
raise ValueError(f"detail page number not match, got ->[{detail_number}]")
desc = browser.page.select_one('div.d-flex.videoDataBlock > div.synopsis > p').text.strip()
return desc
except Exception as e: except Exception as e:
if debug: if debug:
print(f"[-]MP def getStoryline_airavwiki Error: {e}, number [{number}].") print(f"[-]MP def getStoryline_airavwiki Error: {e}, number [{number}].")
@@ -179,7 +161,7 @@ def getStoryline_airavwiki(number, debug):
return '' return ''
def getStoryline_58avgo(number, debug): def getStoryline_58avgo(number, debug, proxies, verify):
try: try:
url = 'http://58avgo.com/cn/index.aspx' + secrets.choice([ url = 'http://58avgo.com/cn/index.aspx' + secrets.choice([
'', '?status=3', '?status=4', '?status=7', '?status=9', '?status=10', '?status=11', '?status=12', '', '?status=3', '?status=4', '?status=7', '?status=9', '?status=10', '?status=11', '?status=12',
@@ -188,6 +170,7 @@ def getStoryline_58avgo(number, debug):
kwd = number[:6] if re.match(r'\d{6}[\-_]\d{2,3}', number) else number kwd = number[:6] if re.match(r'\d{6}[\-_]\d{2,3}', number) else number
result, browser = get_html_by_form(url, result, browser = get_html_by_form(url,
fields = {'ctl00$TextBox_SearchKeyWord' : kwd}, fields = {'ctl00$TextBox_SearchKeyWord' : kwd},
proxies=proxies, verify=verify,
return_type = 'browser') return_type = 'browser')
if not result: if not result:
raise ValueError(f"get_html_by_form('{url}','{number}') failed") raise ValueError(f"get_html_by_form('{url}','{number}') failed")
@@ -218,13 +201,13 @@ def getStoryline_58avgo(number, debug):
return '' return ''
def getStoryline_avno1(number, debug): #获取剧情介绍 从avno1.cc取得 def getStoryline_avno1(number, debug, proxies, verify): #获取剧情介绍 从avno1.cc取得
try: try:
site = secrets.choice(['1768av.club','2nine.net','av999.tv','avno1.cc', site = secrets.choice(['1768av.club','2nine.net','av999.tv','avno1.cc',
'hotav.biz','iqq2.xyz','javhq.tv', 'hotav.biz','iqq2.xyz','javhq.tv',
'www.hdsex.cc','www.porn18.cc','www.xxx18.cc',]) 'www.hdsex.cc','www.porn18.cc','www.xxx18.cc',])
url = f'http://{site}/cn/search.php?kw_type=key&kw={number}' url = f'http://{site}/cn/search.php?kw_type=key&kw={number}'
lx = fromstring(get_html_by_scraper(url)) lx = fromstring(get_html_by_scraper(url, proxies=proxies, verify=verify))
descs = lx.xpath('//div[@class="type_movie"]/div/ul/li/div/@data-description') descs = lx.xpath('//div[@class="type_movie"]/div/ul/li/div/@data-description')
titles = lx.xpath('//div[@class="type_movie"]/div/ul/li/div/a/h3/text()') titles = lx.xpath('//div[@class="type_movie"]/div/ul/li/div/a/h3/text()')
if not descs or not len(descs): if not descs or not len(descs):
@@ -245,7 +228,7 @@ def getStoryline_avno1(number, debug): #获取剧情介绍 从avno1.cc取得
return '' return ''
def getStoryline_avno1OLD(number, debug): #获取剧情介绍 从avno1.cc取得 def getStoryline_avno1OLD(number, debug, proxies, verify): #获取剧情介绍 从avno1.cc取得
try: try:
url = 'http://www.avno1.cc/cn/' + secrets.choice(['usercenter.php?item=' + url = 'http://www.avno1.cc/cn/' + secrets.choice(['usercenter.php?item=' +
secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn']), secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn']),
@@ -254,6 +237,7 @@ def getStoryline_avno1OLD(number, debug): #获取剧情介绍 从avno1.cc取得
result, browser = get_html_by_form(url, result, browser = get_html_by_form(url,
form_select='div.wrapper > div.header > div.search > form', form_select='div.wrapper > div.header > div.search > form',
fields = {'kw' : number}, fields = {'kw' : number},
proxies=proxies, verify=verify,
return_type = 'browser') return_type = 'browser')
if not result: if not result:
raise ValueError(f"get_html_by_form('{url}','{number}') failed") raise ValueError(f"get_html_by_form('{url}','{number}') failed")
@@ -271,19 +255,14 @@ def getStoryline_avno1OLD(number, debug): #获取剧情介绍 从avno1.cc取得
return '' return ''
def getStoryline_xcity(number, debug): #获取剧情介绍 从xcity取得 def getStoryline_xcity(number, debug, proxies, verify): #获取剧情介绍 从xcity取得
try: try:
xcity_number = number.replace('-','') xcityEngine = Xcity()
query_result, browser = get_html_by_form( xcityEngine.proxies = proxies
'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']), xcityEngine.verify = verify
fields = {'q' : xcity_number.lower()}, jsons = xcityEngine.search(number)
return_type = 'browser') outline = json.loads(jsons).get('outline')
if not query_result or not query_result.ok: return outline
raise ValueError("page not found")
result = browser.follow_link(browser.links('avod\/detail')[0])
if not result.ok:
raise ValueError("detail page not found")
return browser.page.select_one('h2.title-detail + p.lead').text.strip()
except Exception as e: except Exception as e:
if debug: if debug:
print(f"[-]MP getOutline_xcity Error: {e}, number [{number}].") print(f"[-]MP getOutline_xcity Error: {e}, number [{number}].")

View File

@@ -13,10 +13,10 @@ class Tmdb(Parser):
imagecut = 0 imagecut = 0
apikey = None apikey = None
expr_title = '//head/meta[@property="og:title"]' expr_title = '//head/meta[@property="og:title"]/@content'
expr_release = '//div/span[@class="release"]/text()' expr_release = '//div/span[@class="release"]/text()'
expr_cover = '//head/meta[@property="og:image"]' expr_cover = '//head/meta[@property="og:image"]/@content'
expr_outline = '//head/meta[@property="og:description"]' expr_outline = '//head/meta[@property="og:description"]/@content'
# def search(self, number): # def search(self, number):
# self.detailurl = self.queryNumberUrl(number) # self.detailurl = self.queryNumberUrl(number)
@@ -30,11 +30,6 @@ class Tmdb(Parser):
movieUrl = "https://www.themoviedb.org/movie/" + id + "?language=zh-CN" movieUrl = "https://www.themoviedb.org/movie/" + id + "?language=zh-CN"
return movieUrl return movieUrl
def getTitle(self, htmltree):
return self.getTreeElement(htmltree, self.expr_title).get('content')
def getCover(self, htmltree): def getCover(self, htmltree):
return "https://www.themoviedb.org" + self.getTreeElement(htmltree, self.expr_cover).get('content') return "https://www.themoviedb.org" + self.getTreeElement(htmltree, self.expr_cover)
def getOutline(self, htmltree):
return self.getTreeElement(htmltree, self.expr_outline).get('content')

View File

@@ -3,7 +3,6 @@
import re import re
import secrets import secrets
from urllib.parse import urljoin from urllib.parse import urljoin
from lxml import etree
from .httprequest import get_html_by_form from .httprequest import get_html_by_form
from .parser import Parser from .parser import Parser
@@ -13,6 +12,9 @@ class Xcity(Parser):
expr_number = '//*[@id="hinban"]/text()' expr_number = '//*[@id="hinban"]/text()'
expr_title = '//*[@id="program_detail_title"]/text()' expr_title = '//*[@id="program_detail_title"]/text()'
expr_actor = '//ul/li[@class="credit-links"]/a/text()'
expr_actor_link = '//ul/li[@class="credit-links"]/a'
expr_actorphoto = '//div[@class="frame"]/div/p/img/@src'
expr_studio = '//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[4]/a/span/text()' expr_studio = '//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[4]/a/span/text()'
expr_studio2 = '//strong[contains(text(),"片商")]/../following-sibling::span/a/text()' expr_studio2 = '//strong[contains(text(),"片商")]/../following-sibling::span/a/text()'
expr_runtime = '//span[@class="koumoku" and text()="収録時間"]/../text()' expr_runtime = '//span[@class="koumoku" and text()="収録時間"]/../text()'
@@ -23,6 +25,20 @@ class Xcity(Parser):
expr_director = '//*[@id="program_detail_director"]/text()' expr_director = '//*[@id="program_detail_director"]/text()'
expr_series = "//span[contains(text(),'シリーズ')]/../a/span/text()" expr_series = "//span[contains(text(),'シリーズ')]/../a/span/text()"
expr_series2 = "//span[contains(text(),'シリーズ')]/../span/text()" expr_series2 = "//span[contains(text(),'シリーズ')]/../span/text()"
expr_extrafanart = '//div[@id="sample_images"]/div/a/@href'
expr_outline = '//head/meta[@property="og:description"]/@content'
def queryNumberUrl(self, number):
xcity_number = number.replace('-','')
query_result, browser = get_html_by_form(
'https://xcity.jp/' + secrets.choice(['sitemap/','policy/','law/','help/','main/']),
fields = {'q' : xcity_number.lower()},
cookies=self.cookies, proxies=self.proxies, verify=self.verify,
return_type = 'browser')
if not query_result or not query_result.ok:
raise ValueError("xcity.py: page not found")
prelink = browser.links('avod\/detail')[0]['href']
return urljoin('https://xcity.jp', prelink)
def getStudio(self, htmltree): def getStudio(self, htmltree):
return super().getStudio(htmltree).strip('+').replace("', '", '').replace('"', '') return super().getStudio(htmltree).strip('+').replace("', '", '').replace('"', '')
@@ -51,65 +67,26 @@ class Xcity(Parser):
except: except:
return '' return ''
def getOutline(self, htmltree):
if self.morestoryline:
from .storyline import getStoryline
return getStoryline(self.number, uncensored=False)
return ''
def getActors(self, htmltree):
htmla = self.browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a')
t = []
for i in htmla:
t.append(i.text.strip())
return t
def getActorPhoto(self, htmltree): def getActorPhoto(self, htmltree):
htmla = self.browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a') treea = self.getTreeAll(htmltree, self.expr_actor_link)
t = {i.text.strip(): i['href'] for i in htmla} t = {i.text.strip(): i.attrib['href'] for i in treea}
o = {} o = {}
for k, v in t.items(): for k, v in t.items():
r = self.browser.open_relative(v) actorpageUrl = "https://xcity.jp" + v
if not r.ok: try:
continue adtree = self.getHtmlTree(actorpageUrl)
pic = self.browser.page.select_one('#avidolDetails > div > div.frame > div > p > img') picUrl = self.getTreeElement(adtree, self.expr_actorphoto)
if 'noimage.gif' in pic['src']: if 'noimage.gif' in picUrl:
continue continue
o[k] = urljoin(self.browser.url, pic['src']) o[k] = urljoin("https://xcity.jp", picUrl)
except:
pass
return o return o
def getExtrafanart(self, htmltree): def getExtrafanart(self, htmltree):
html_pather = re.compile(r'<div id="sample_images".*?>[\s\S]*?</div>') arts = self.getTreeAll(htmltree, self.expr_extrafanart)
html = html_pather.search(self.detail_page) extrafanart = []
if html: for i in arts:
html = html.group() i = "https:" + i
extrafanart_pather = re.compile(r'<a.*?href=\"(.*?)\"') extrafanart.append(i)
extrafanart_imgs = extrafanart_pather.findall(html) return extrafanart
if extrafanart_imgs:
s = []
for urli in extrafanart_imgs:
urli = 'https:' + urli.replace('/scene/small', '')
s.append(urli)
return s
return ''
def open_by_browser(self, number):
xcity_number = number.replace('-','')
query_result, browser = get_html_by_form(
'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']),
fields = {'q' : xcity_number.lower()},
return_type = 'browser')
if not query_result or not query_result.ok:
raise ValueError("xcity.py: page not found")
result = browser.follow_link(browser.links('avod\/detail')[0])
if not result.ok:
raise ValueError("xcity.py: detail page not found")
return str(browser.page), browser
def search(self, number):
self.number = number
self.detail_page, self.browser = self.open_by_browser(number)
self.detailurl = self.browser.url
lx = etree.fromstring(self.detail_page, etree.HTMLParser())
result = self.dictformat(lx)
return result