Files
AV_Data_Capture/scrapinglib/api.py
2023-05-19 02:25:23 +08:00

271 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
import re
import json
from .parser import Parser
import config
import importlib
def search(number, sources: str = None, **kwargs):
    """Search for metadata by release number or movie name.

    Convenience wrapper that builds a fresh ``Scraping`` object and forwards
    the call to its ``search`` method.

    :param number: number/name, depends on ``type``
    :param sources: source names joined with ``,`` e.g. ``avsox,javbus``
    :param type: ``adult`` or ``general`` (forwarded through ``kwargs``)
    :return: whatever ``Scraping.search`` returns
    """
    scraper = Scraping()
    result = scraper.search(number, sources, **kwargs)
    return result
def getSupportedSources(tag='adult'):
    """Return the supported source names as one comma-separated string.

    :param tag: ``adult`` selects the adult source list; any other value
        selects the general (movie/TV) source list
    """
    scraper = Scraping()
    names = scraper.adult_full_sources if tag == 'adult' else scraper.general_full_sources
    return ','.join(names)
class Scraping:
    """Dispatch a metadata query to the per-site parser modules.

    Each source name maps to a module ``scrapinglib.<source>`` exposing a
    class named ``<Source>`` (the capitalized source name) whose ``scrape``
    method is called with the query and this object. Sources are tried in
    order until one yields data accepted by ``get_data_state``.
    """
    adult_full_sources = ['javlibrary', 'javdb', 'javbus', 'airav', 'fanza', 'xcity', 'jav321',
                          'mgstage', 'fc2', 'avsox', 'dlsite', 'carib', 'madou',
                          'getchu', 'gcolle', 'javday', 'pissplay', 'javmenu', 'caribpr'
                          ]
    general_full_sources = ['tmdb', 'imdb']

    # Per-search settings; overwritten on every call to ``search``.
    debug = False
    proxies = None
    verify = None
    specifiedSource = None
    specifiedUrl = None

    dbcookies = None
    dbsite = None
    # Use the storyline method to fetch a more detailed plot.
    morestoryline = False

    def search(self, number, sources=None, proxies=None, verify=None, type='adult',
               specifiedSource=None, specifiedUrl=None,
               dbcookies=None, dbsite=None, morestoryline=False,
               debug=False):
        """Store the per-search settings on this object and run the search.

        :param number: release number (adult) or title (general)
        :param sources: comma-separated source names, a list of names
            (adult only), or ``None`` for the full default list
        :param type: ``'adult'`` runs ``searchAdult``; anything else runs
            ``searchGeneral``. The name shadows the builtin but is part of
            the public keyword interface, so it is kept.
        :return: metadata dict, or ``None`` when no source produced a title
        """
        self.debug = debug
        self.proxies = proxies
        self.verify = verify
        self.specifiedSource = specifiedSource
        self.specifiedUrl = specifiedUrl
        self.dbcookies = dbcookies
        self.dbsite = dbsite
        self.morestoryline = morestoryline
        if type == 'adult':
            return self.searchAdult(number, sources)
        return self.searchGeneral(number, sources)

    def _scrape_first_valid(self, query, sources):
        """Try each source in order; stop at the first one with valid data.

        Returns the last payload that was parsed (``{}`` if none was), which
        is not necessarily one that passed ``get_data_state``.
        """
        json_data = {}
        for source in sources:
            try:
                if self.debug:
                    print('[+]select', source)
                try:
                    module = importlib.import_module('.' + source, 'scrapinglib')
                    parser_type = getattr(module, source.capitalize())
                    parser: Parser = parser_type()
                    data = parser.scrape(query, self)
                    if data == 404:
                        continue
                    json_data = json.loads(data)
                except Exception as e:
                    if config.getInstance().debug():
                        print(e)
                # if any service return a valid return, break
                if self.get_data_state(json_data):
                    if self.debug:
                        print(f"[+]Find movie [{query}] metadata on website '{source}'")
                    break
            except Exception:
                # Best effort: a failing source must not abort the search.
                # ``Exception`` (not a bare except) keeps Ctrl-C and
                # SystemExit working.
                continue
        return json_data

    def _finalize(self, json_data):
        """Return ``json_data`` with the anonymous-actor fill applied, or ``None``."""
        # Return if data not found in all sources
        if not isinstance(json_data, dict) or not json_data.get('title'):
            return None
        # If actor is anonymous, Fill in Anonymous
        if not json_data.get('actor'):
            # Equality with True is kept on purpose: the accessor's return
            # type is not visible here and only a True-equal value enabled
            # the fill before.
            if config.getInstance().anonymous_fill() == True:  # noqa: E712
                if "zh_" in config.getInstance().get_target_language():
                    json_data['actor'] = "佚名"
                else:
                    json_data['actor'] = "Anonymous"
        return json_data

    def searchGeneral(self, name, sources):
        """Look up a movie or TV series (imdb, tmdb).

        :param name: title to search for
        :param sources: comma-separated source names or ``None``; ignored
            when ``specifiedSource`` is set
        :return: metadata dict, or ``None`` when nothing was found
        """
        if self.specifiedSource:
            sources = [self.specifiedSource]
        else:
            sources = self.checkGeneralSources(sources, name)
        return self._finalize(self._scrape_first_valid(name, sources))

    def searchAdult(self, number, sources):
        """Look up an adult release by number.

        :param number: release number / file number
        :param sources: list of source names (used as given), a
            comma-separated string, or ``None``; ignored when
            ``specifiedSource`` is set
        :return: metadata dict, or ``None`` when nothing was found
        """
        if self.specifiedSource:
            sources = [self.specifiedSource]
        elif isinstance(sources, list):
            pass
        else:
            sources = self.checkAdultSources(sources, number)
        json_data = self._scrape_first_valid(number, sources)
        # javdb covers carry a watermark; replace the cover with one from
        # another source when possible.
        if isinstance(json_data, dict) and json_data.get('source') == 'javdb':
            self._replace_javdb_cover(number, sources, json_data)
        return self._finalize(json_data)

    def _replace_javdb_cover(self, number, sources, json_data):
        """Swap ``json_data['cover']`` for a cover from a source after javdb.

        Keeps the javdb cover when no later source provides one. The list of
        remaining sources shrinks on every pass, so the loop always ends.
        """
        if 'javdb' not in sources:
            return
        remaining = sources[sources.index('javdb') + 1:]
        while remaining:
            try:
                other_json_data = self.searchAdult(number, remaining)
            except Exception:
                # Best effort only: keep the javdb cover.
                break
            if not isinstance(other_json_data, dict):
                # Nothing found in the remaining sources. Retrying would
                # repeat the same failing search forever.
                break
            cover = other_json_data.get('cover')
            if cover:
                json_data['cover'] = cover
                if self.debug:
                    print(f"[+]Find movie [{number}] cover on website '{cover}'")
                break
            # Without a known source we cannot tell where to resume.
            found_in = other_json_data.get('source')
            if found_in not in remaining:
                break
            # check other sources
            remaining = remaining[remaining.index(found_in) + 1:]

    def checkGeneralSources(self, c_sources, name):
        """Return the general sources to query as a new list.

        :param c_sources: comma-separated source names, or falsy for all
        :param name: unused
        :return: requested names that exist in ``general_full_sources``
        """
        if not c_sources:
            # Copy so callers cannot modify the class-level default list.
            return list(self.general_full_sources)
        sources = [s.strip() for s in c_sources.split(',') if s.strip()]
        # check sources in func_mapping
        unknown = [s for s in sources if s not in self.general_full_sources]
        for s in unknown:
            print('[!] Source Not Exist : ' + s)
        for d in unknown:
            print('[!] Remove Source : ' + d)
        return [s for s in sources if s not in unknown]

    @staticmethod
    def _prioritize(sources, source):
        """Move ``source`` to the front of ``sources`` if present; return the list."""
        if source in sources:
            sources.insert(0, sources.pop(sources.index(source)))
        return sources

    def checkAdultSources(self, c_sources, file_number):
        """Return the adult sources to query, best guess first, as a new list.

        :param c_sources: comma-separated source names, or falsy for all
        :param file_number: release number; its shape decides which source
            is moved to the front
        :return: requested names that exist in ``adult_full_sources``
        """
        if not c_sources:
            # Copy: the reordering below must not change the class-level
            # default order seen by later calls.
            sources = list(self.adult_full_sources)
        else:
            sources = [s.strip() for s in c_sources.split(',') if s.strip()]
        insert = self._prioritize
        if len(sources) <= len(self.adult_full_sources):
            # if the input file name matches certain rules,
            # move some web service to the beginning of the list
            lo_file_number = file_number.lower()
            if "carib" in sources and re.search(r"^\d{6}-\d{3}", file_number):
                sources = insert(sources, "carib")
            elif "caribpr" in sources and re.search(r"^\d{6}-\d{3}", file_number):
                sources = insert(sources, "caribpr")
            elif "item" in file_number or "GETCHU" in file_number.upper():
                sources = insert(sources, "getchu")
            elif "rj" in lo_file_number or "vj" in lo_file_number \
                    or re.search(r"[\u3040-\u309F\u30A0-\u30FF]+", file_number):
                sources = insert(sources, "getchu")
                sources = insert(sources, "dlsite")
            elif "fc2" in lo_file_number:
                if "fc2" in sources:
                    sources = insert(sources, "fc2")
            elif "mgstage" in sources and \
                    (re.search(r"\d+\D+", file_number) or "siro" in lo_file_number):
                sources = insert(sources, "mgstage")
            elif "gcolle" in sources and re.search(r"\d{6}", file_number):
                sources = insert(sources, "gcolle")
            elif "madou" in sources and re.search(r"^[a-z0-9]{3,}-[0-9]{1,}$", lo_file_number):
                sources = insert(sources, "madou")
            elif re.search(r"^\d{5,}", file_number) or "heyzo" in lo_file_number:
                if "avsox" in sources:
                    sources = insert(sources, "avsox")
            elif re.search(r"^[a-z0-9]{3,}$", lo_file_number):
                if "xcity" in sources:
                    sources = insert(sources, "xcity")
                if "madou" in sources:
                    sources = insert(sources, "madou")
        # check sources in func_mapping
        unknown = [s for s in sources if s not in self.adult_full_sources]
        if unknown:
            # Only the messages depend on debug mode; unknown names are
            # always dropped.
            if config.getInstance().debug():
                for s in unknown:
                    print('[!] Source Not Exist : ' + s)
                for d in unknown:
                    print('[!] Remove Source : ' + d)
            sources = [s for s in sources if s not in unknown]
        return sources

    def get_data_state(self, data: dict) -> bool:
        """Report whether ``data`` is a usable metadata payload.

        A payload is usable when ``title`` and ``number`` are present and
        non-blank and at least one of ``cover`` / ``cover_small`` is
        non-blank. Blank means ``None``, ``""`` or the string ``"null"``;
        a missing key counts as blank.
        """
        if not isinstance(data, dict):
            return False

        def blank(value):
            return value is None or value == "" or value == "null"

        if blank(data.get("title")) or blank(data.get("number")):
            return False
        if blank(data.get("cover")) and blank(data.get("cover_small")):
            return False
        return True