add scrapinglib
This commit is contained in:
173
scrapinglib/api.py
Normal file
173
scrapinglib/api.py
Normal file
@@ -0,0 +1,173 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import re
|
||||
import json
|
||||
|
||||
from scrapinglib.airav import Airav
|
||||
from scrapinglib.carib import Carib
|
||||
from scrapinglib.dlsite import Dlsite
|
||||
from scrapinglib.fanza import Fanza
|
||||
from scrapinglib.gcolle import Gcolle
|
||||
from scrapinglib.jav321 import Jav321
|
||||
from scrapinglib.javdb import Javdb
|
||||
from scrapinglib.mv91 import Mv91
|
||||
from .fc2 import Fc2
|
||||
from .madou import Madou
|
||||
from .mgstage import Mgstage
|
||||
from .javbus import Javbus
|
||||
from .xcity import Xcity
|
||||
from .avsox import Avsox
|
||||
|
||||
|
||||
def search(number, souces=None, proxies=None, dbcookies=None):
    """Module-level convenience wrapper around :class:`Scraping`.

    Creates a fresh ``Scraping`` instance and forwards the query to it.

    NOTE(review): ``souces`` is a misspelling of ``sources`` but is kept
    as-is because it is part of the public keyword interface.

    TODO: support more sites (douban, imdb, tmdb, anidb, ...);
    distinguish r18 vs normal content types.
    """
    scraper = Scraping()
    return scraper.search(number, souces, proxies=proxies, dbcookies=dbcookies)
|
||||
|
||||
|
||||
class Scraping():
    """
    Dispatch a movie-number metadata lookup across multiple scraper sources.

    This class only fetches content as-is, without modification.
    Translation, naming-rule handling and actress-name/tag conversion
    belong in a wrapper layer built on top of this class, keeping the
    internals simple.

    The caller may restrict/order the scraper sources used, and query
    the currently supported sources via ``full_sources``.

    Parameters accepted by :meth:`search`:
        number    -- movie identifier to look up
        sources   -- comma-separated string (or sequence) of source names
        proxies   -- requests-style proxies mapping handed to scrapers
        dbcookies -- cookies (presumably for javdb -- TODO confirm)

    TODO: multi threading (would acceleration trigger anti-scraping?)
    """

    # Every supported source, in default priority order.
    full_sources = ['avsox', 'javbus', 'xcity', 'mgstage', 'madou', 'fc2',
                    'dlsite', 'jav321', 'fanza', 'airav', 'carib', 'mv91',
                    'gcolle', 'javdb']

    # source name -> bound ``search`` callable of that scraper instance.
    func_mapping = {
        'avsox': Avsox().search,
        'javbus': Javbus().search,
        'xcity': Xcity().search,
        'mgstage': Mgstage().search,
        'madou': Madou().search,
        'fc2': Fc2().search,
        'dlsite': Dlsite().search,
        'jav321': Jav321().search,
        'fanza': Fanza().search,
        'airav': Airav().search,
        'carib': Carib().search,
        'mv91': Mv91().search,
        'gcolle': Gcolle().search,
        'javdb': Javdb().search,
    }

    # Per-search request settings; scrapers read these off the instance
    # passed to them as the second argument of their search function.
    proxies = None
    dbcookies = None

    def search(self, number, sources=None, proxies=None, dbcookies=None):
        """Try each source in priority order and return the first valid
        metadata dict, or ``None`` when no source finds ``number``."""
        self.proxies = proxies
        self.dbcookies = dbcookies

        sources = self.checkSources(sources, number)
        json_data = {}
        for source in sources:
            try:
                print('[+]select', source)
                try:
                    data = self.func_mapping[source](number, self)
                    if data == 404:  # scraper convention for "not found"
                        continue
                    json_data = json.loads(data)
                except Exception as e:
                    print('[!] 出错啦')
                    print(e)
                # if any service returned a valid result, stop searching
                if self.get_data_state(json_data):
                    print(f"[+]Find movie [{number}] metadata on website '{source}'")
                    break
            except Exception as e:
                # Was a bare ``except:``; narrowed so KeyboardInterrupt /
                # SystemExit still propagate, and the error is reported
                # instead of silently aborting the loop.
                print(e)
                break

        # Return None if data was not found in any source
        if not json_data:
            print(f'[-]Movie Number [{number}] not found!')
            return None

        return json_data

    def checkSources(self, c_sources, file_number):
        """Normalize the requested source list, reorder it so sources
        likely to match ``file_number`` are tried first, and drop any
        name that has no registered scraper.

        ``c_sources`` may be a comma-separated string, a sequence of
        source names, or falsy (meaning: use all known sources).
        """
        if not c_sources:
            c_sources = self.full_sources

        # Accept both a comma-separated string and an already-split
        # sequence.  (The previous code unconditionally called
        # ``.split(',')``, which raised AttributeError whenever the
        # default ``full_sources`` *list* was used.)
        if isinstance(c_sources, str):
            sources = c_sources.split(',')
        else:
            sources = list(c_sources)

        def prioritize(sources, source):
            # Move ``source`` to the front of the list if present.
            if source in sources:
                sources.insert(0, sources.pop(sources.index(source)))
            return sources

        if len(sources) <= len(self.func_mapping):
            # If the input file name matches certain rules,
            # move the matching web service to the front of the list.
            lo_file_number = file_number.lower()
            if "carib" in sources and re.match(r"^\d{6}-\d{3}", file_number):
                sources = prioritize(sources, "carib")
            elif "item" in file_number:
                # NOTE(review): "getchu" has no entry in func_mapping yet,
                # so it only matters if the caller explicitly requested it.
                sources = prioritize(sources, "getchu")
            elif re.match(r"^\d{5,}", file_number) or "heyzo" in lo_file_number:
                if "avsox" in sources:
                    sources = prioritize(sources, "avsox")
            elif "mgstage" in sources and \
                    (re.match(r"\d+\D+", file_number) or "siro" in lo_file_number):
                sources = prioritize(sources, "mgstage")
            elif "fc2" in lo_file_number:
                if "fc2" in sources:
                    sources = prioritize(sources, "fc2")
            elif "gcolle" in sources and re.search(r"\d{6}", file_number):
                # raw string: the old non-raw "\d{6}" is a SyntaxWarning
                # on modern Python
                sources = prioritize(sources, "gcolle")
            elif "dlsite" in sources and (
                    "rj" in lo_file_number or "vj" in lo_file_number):
                sources = prioritize(sources, "dlsite")
            elif re.match(r"^[a-z0-9]{3,}$", lo_file_number):
                if "xcity" in sources:
                    sources = prioritize(sources, "xcity")
                if "madou" in sources:
                    sources = prioritize(sources, "madou")
            elif "madou" in sources and \
                    re.match(r"^[a-z0-9]{3,}-[0-9]{1,}$", lo_file_number):
                sources = prioritize(sources, "madou")

        # Drop every source that has no registered scraper.
        todel = [s for s in sources if s not in self.func_mapping]
        for d in todel:
            print('[!] Source Not Exist : ' + d)
            # Bug fix: previously printed the loop leftover ``s`` instead
            # of the source actually being removed.
            print('[!] Remove Source : ' + d)
            sources.remove(d)
        return sources

    def get_data_state(self, data: dict) -> bool:
        """Return ``True`` when ``data`` carries a usable title and
        number (metadata-fetch failure detection)."""
        for key in ("title", "number"):
            if key not in data:
                return False
            # Scrapers may report failure as None, "" or the literal
            # string "null".
            if data[key] is None or data[key] == "" or data[key] == "null":
                return False
        return True
|
||||
Reference in New Issue
Block a user