Merge pull request #993 from hejianjun/feature/自定义爬虫番号处理
Feature: custom scraper number (番号) handling
@@ -57,7 +57,7 @@ image_naming_with_number = 0
 update_check = 1
 
 [priority]
-website = javbus,airav,jav321,fanza,xcity,mgstage,fc2,avsox,dlsite,carib,madou,mv91,getchu,javdb,gcolle
+website = javbus,airav,jav321,fanza,xcity,mgstage,fc2,avsox,dlsite,carib,madou,mv91,getchu,javdb,gcolle,javday,javmenu
 
 [escape]
 literals = \()/
@@ -59,7 +59,7 @@ def get_number(debug: bool, file_path: str) -> str:
     if not re.search("-|_", filename): # 去掉-CD1之后再无-的情况,例如n1012-CD1.wmv
         return str(re.search(r'\w+', filename[:filename.find('.')], re.A).group())
     file_number = os.path.splitext(filename)
-    filename = re.search(r'\w+(-|_)\w+', filename, re.A)
+    filename = re.search(r'[\w\-_]+', filename, re.A)
     if filename:
         file_number = str(filename.group())
     else:
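The widened pattern keeps the whole run of word characters, hyphens and underscores instead of only the first `word-word` pair, so multi-part numbers survive. A minimal sketch of the difference, with hypothetical filenames not taken from the PR:

```python
import re

# Hypothetical filenames for illustration only.
for name in ['abp-123.mp4', 'mky-ks-001-ep2.mp4']:
    old = re.search(r'\w+(-|_)\w+', name, re.A)   # old rule: exactly two word groups
    new = re.search(r'[\w\-_]+', name, re.A)      # new rule: whole run of word chars, '-' and '_'
    print(name, old.group() if old else None, new.group() if new else None)
# abp-123.mp4        -> abp-123 / abp-123
# mky-ks-001-ep2.mp4 -> mky-ks  / mky-ks-001-ep2
```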
@@ -85,34 +85,7 @@ def get_number(debug: bool, file_path: str) -> str:
         print(f'[-]Number Parser exception: {e} [{file_path}]')
         return None
 
-
-# modou提取number
-def md(filename):
-    m = re.search(r'(md[a-z]{0,2}-?)(\d{2,})(-ep\d*|-\d*)*', filename, re.I)
-    return f'{m.group(1).replace("-","").upper()}{m.group(2).zfill(4)}{m.group(3) or ""}'
-
-def mmz(filename):
-    m = re.search(r'(mmz-?)(\d{2,})(-ep\d*)*', filename, re.I)
-    return f'{m.group(1).replace("-","").upper()}{m.group(2).zfill(3)}{m.group(3) or ""}'
-
-def msd(filename):
-    m = re.search(r'(msd-?)(\d{2,})(-ep\d*)*', filename, re.I)
-    return f'{m.group(1).replace("-","").upper()}{m.group(2).zfill(3)}{m.group(3) or ""}'
-
-def mky(filename):
-    m = re.search(r'(mky-[a-z]{2,2}-?)(\d{2,})(-ep\d*)*', filename, re.I)
-    return f'{m.group(1).replace("-","").upper()}{m.group(2).zfill(3)}{m.group(3) or ""}'
-
-def yk(filename):
-    m = re.search(r'(yk-?)(\d{2,})(-ep\d*)*', filename, re.I)
-    return f'{m.group(1).replace("-","").upper()}{m.group(2).zfill(3)}{m.group(3) or ""}'
-
-def pm(filename):
-    m = re.search(r'(pm[a-z]?-?)(\d{2,})(-ep\d*)*', filename, re.I)
-    return f'{m.group(1).replace("-","").upper()}{m.group(2).zfill(3)}{m.group(3) or ""}'
-
-def fsog(filename):
-    m = re.search(r'(fsog-?)(\d{2,})(-ep\d*)*', filename, re.I)
-    return f'{m.group(1).replace("-","").upper()}{m.group(2).zfill(3)}{m.group(3) or ""}'
-
 # 按javdb数据源的命名规范提取number
 G_TAKE_NUM_RULES = {
@@ -126,13 +99,6 @@ G_TAKE_NUM_RULES = {
     'heyzo': lambda x: 'HEYZO-' + re.findall(r'heyzo[^\d]*(\d{4})', x, re.I)[0],
     'mdbk': lambda x: str(re.search(r'mdbk(-|_)(\d{4})', x, re.I).group()),
     'mdtm': lambda x: str(re.search(r'mdtm(-|_)(\d{4})', x, re.I).group()),
-    r'\bmd[a-z]{0,2}-\d{2,}': md,
-    r'\bmmz-\d{2,}':mmz,
-    r'\bmsd-\d{2,}':msd,
-    r'\bmky-[a-z]{2,2}-\d{2,}':mky,
-    r'\byk-\d{2,3}': yk,
-    r'\bpm[a-z]?-?\d{2,}':pm,
-    r'\bfsog-?\d{2,}':fsog
 }
 
 
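These per-prefix helpers and their regex keys are dropped here because the same normalization now lives in scrapinglib/madou.py as change_number() (added further down in this diff). For context, G_TAKE_NUM_RULES maps a regex key to a callable; a minimal sketch of how such a table is typically consumed (the lookup loop is illustrative, not copied from the repository):

```python
import re

G_TAKE_NUM_RULES = {
    'heyzo': lambda x: 'HEYZO-' + re.findall(r'heyzo[^\d]*(\d{4})', x, re.I)[0],
    'mdbk': lambda x: str(re.search(r'mdbk(-|_)(\d{4})', x, re.I).group()),
}

def take_num(filename):
    # Try each regex key against the filename and apply its rule on a hit.
    for pattern, rule in G_TAKE_NUM_RULES.items():
        if re.search(pattern, filename, re.I):
            return rule(filename)
    return None

print(take_num('HEYZO-1234.mp4'))  # -> HEYZO-1234
print(take_num('MDBK-0012.mp4'))   # -> MDBK-0012
```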
@@ -1,21 +1,21 @@
 # If you can't run this script, please execute the following command in PowerShell.
 # Set-ExecutionPolicy RemoteSigned -Scope CurrentUser -Force
 
-$CLOUDSCRAPER_PATH=$(python -c 'import cloudscraper as _; print(_.__path__[0])' | select -Last 1)
-$OPENCC_PATH=$(python -c 'import opencc as _; print(_.__path__[0])' | select -Last 1)
-$FACE_RECOGNITION_MODELS=$(python -c 'import face_recognition_models as _; print(_.__path__[0])' | select -Last 1)
+# bugfix:set submodules find path
+$Env:PYTHONPATH=$pwd.path
+$PYTHONPATH=$pwd.path
 
 mkdir build
 mkdir __pycache__
 
-pyinstaller --onefile Movie_Data_Capture.py `
-    --hidden-import "ImageProcessing.cnn" `
-    --python-option u `
-    --add-data "$FACE_RECOGNITION_MODELS;face_recognition_models" `
-    --add-data "$CLOUDSCRAPER_PATH;cloudscraper" `
-    --add-data "$OPENCC_PATH;opencc" `
+pyinstaller --collect-submodules "scrapinglib" `
+    --collect-submodules "ImageProcessing" `
+    --collect-data "face_recognition_models" `
+    --collect-data "cloudscraper" `
+    --collect-data "opencc" `
     --add-data "Img;Img" `
     --add-data "config.ini;." `
+    --onefile Movie_Data_Capture.py
 
 rmdir -Recurse -Force build
 rmdir -Recurse -Force __pycache__
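The build script drops the hand-resolved --add-data package paths in favour of PyInstaller's --collect-data/--collect-submodules, and points PYTHONPATH at the working directory so the project packages resolve during analysis. Collecting "scrapinglib" as submodules is likely needed because the scrapers are now loaded dynamically (see the api.py hunks below), which PyInstaller's static import analysis cannot see. A small sketch of the pattern it would otherwise miss ('javbus' is just an example source name; assumes the repository is on PYTHONPATH):

```python
import importlib

source = 'javbus'  # example source name
module = importlib.import_module('.' + source, 'scrapinglib')
print(module.__name__)  # -> scrapinglib.javbus
```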
@@ -1,3 +1,2 @@
 # -*- coding: utf-8 -*-
-
 from .api import search, getSupportedSources
@@ -2,30 +2,9 @@
 
 import re
 import json
+from .parser import Parser
 import config
-from .airav import Airav
-from .carib import Carib
-from .dlsite import Dlsite
-from .fanza import Fanza
-from .gcolle import Gcolle
-from .getchu import Getchu
-from .jav321 import Jav321
-from .javdb import Javdb
-from .fc2 import Fc2
-from .madou import Madou
-from .mgstage import Mgstage
-from .javbus import Javbus
-from .xcity import Xcity
-from .avsox import Avsox
-from .javlibrary import Javlibrary
-from .javday import Javday
-from .pissplay import Pissplay
-from .javmenu import Javmenu
-
-from .tmdb import Tmdb
-from .imdb import Imdb
+import importlib
 
 
 def search(number, sources: str = None, **kwargs):
     """ 根据`番号/电影`名搜索信息
@@ -56,32 +35,8 @@ class Scraping:
                           'mgstage', 'fc2', 'avsox', 'dlsite', 'carib', 'madou',
                           'getchu', 'gcolle', 'javday', 'pissplay', 'javmenu'
                           ]
-    adult_func_mapping = {
-        'avsox': Avsox().scrape,
-        'javbus': Javbus().scrape,
-        'xcity': Xcity().scrape,
-        'mgstage': Mgstage().scrape,
-        'madou': Madou().scrape,
-        'fc2': Fc2().scrape,
-        'dlsite': Dlsite().scrape,
-        'jav321': Jav321().scrape,
-        'fanza': Fanza().scrape,
-        'airav': Airav().scrape,
-        'carib': Carib().scrape,
-        'gcolle': Gcolle().scrape,
-        'javdb': Javdb().scrape,
-        'getchu': Getchu().scrape,
-        'javlibrary': Javlibrary().scrape,
-        'javday': Javday().scrape,
-        'pissplay': Pissplay().scrape,
-        'javmenu': Javmenu().scrape
-    }
-
     general_full_sources = ['tmdb', 'imdb']
-    general_func_mapping = {
-        'tmdb': Tmdb().scrape,
-        'imdb': Imdb().scrape,
-    }
 
     debug = False
 
@@ -126,13 +81,16 @@ class Scraping:
             if self.debug:
                 print('[+]select', source)
             try:
-                data = self.general_func_mapping[source](name, self)
+                module = importlib.import_module('.'+source,'scrapinglib')
+                parser_type = getattr(module, source.capitalize())
+                parser:Parser = parser_type()
+                data = parser.scrape(name,self)
                 if data == 404:
                     continue
                 json_data = json.loads(data)
             except Exception as e:
-                # print('[!] 出错啦')
-                # print(e)
+                print('[!] 出错啦')
+                print(e)
                 pass
             # if any service return a valid return, break
             if self.get_data_state(json_data):
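Instead of the hard-coded function mappings, the lookup now derives everything from the source name: scrapinglib/<source>.py must define a class named source.capitalize() ('javday' -> Javday, 'fc2' -> Fc2, 'jav321' -> Jav321, all of which appear in the imports removed above) that implements scrape(). This is what lets a custom scraper be added without touching api.py. A minimal sketch of the convention (assumes the repository and its dependencies are importable):

```python
import importlib
from scrapinglib.parser import Parser

def load_parser(source: str) -> Parser:
    # scrapinglib/<source>.py must expose a class named source.capitalize()
    module = importlib.import_module('.' + source, 'scrapinglib')
    parser_type = getattr(module, source.capitalize())
    return parser_type()

print(type(load_parser('javday')).__name__)  # -> Javday
```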
@@ -162,13 +120,16 @@ class Scraping:
             if self.debug:
                 print('[+]select', source)
             try:
-                data = self.adult_func_mapping[source](number, self)
+                module = importlib.import_module('.'+source,'scrapinglib')
+                parser_type = getattr(module, source.capitalize())
+                parser:Parser = parser_type()
+                data = parser.scrape(number,self)
                 if data == 404:
                     continue
                 json_data = json.loads(data)
             except Exception as e:
-                # print('[!] 出错啦')
-                # print(e)
+                print('[!] 出错啦')
+                print(e)
                 pass
             # json_data = self.func_mapping[source](number, self)
             # if any service return a valid return, break
@@ -216,7 +177,7 @@ class Scraping:
         # check sources in func_mapping
         todel = []
         for s in sources:
-            if not s in self.general_func_mapping:
+            if not s in self.general_full_sources:
                 print('[!] Source Not Exist : ' + s)
                 todel.append(s)
         for d in todel:
@@ -235,7 +196,7 @@ class Scraping:
             sources.insert(0, sources.pop(sources.index(source)))
             return sources
 
-        if len(sources) <= len(self.adult_func_mapping):
+        if len(sources) <= len(self.adult_full_sources):
             # if the input file name matches certain rules,
             # move some web service to the beginning of the list
             lo_file_number = file_number.lower()
@@ -271,7 +232,7 @@ class Scraping:
         # check sources in func_mapping
         todel = []
         for s in sources:
-            if not s in self.adult_func_mapping and config.getInstance().debug():
+            if not s in self.adult_full_sources and config.getInstance().debug():
                 print('[!] Source Not Exist : ' + s)
                 todel.append(s)
         for d in todel:
@@ -22,6 +22,7 @@ class Fc2(Parser):
 
     def extraInit(self):
         self.imagecut = 0
+        self.allow_number_change = True
 
     def search(self, number):
         self.number = number.lower().replace('fc2-ppv-', '').replace('fc2-', '')
@@ -39,3 +39,8 @@ class Javday(Parser):
         # 删除番号和网站名
         result = title.replace(self.number,"").replace("- JAVDAY.TV","").strip()
         return result
+
+    def getTags(self, htmltree) -> list:
+        tags = super().getTags(htmltree)
+        return [tag for tag in tags if 'JAVDAY.TV' not in tag]
+
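The override reuses the base class's tag extraction and only drops the site-name tag; a quick sketch with hypothetical tag values:

```python
# Hypothetical tags as the base Parser might return them.
tags = ['中文字幕', 'JAVDAY.TV', '高清']
print([tag for tag in tags if 'JAVDAY.TV' not in tag])  # -> ['中文字幕', '高清']
```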
@@ -6,6 +6,28 @@ from urllib.parse import urlparse, unquote
 from .parser import Parser
 
 
+NUM_RULES3=[
+    r'(mmz{2,4})-?(\d{2,})(-ep\d*|-\d*)?.*',
+    r'(msd)-?(\d{2,})(-ep\d*|-\d*)?.*',
+    r'(yk)-?(\d{2,})(-ep\d*|-\d*)?.*',
+    r'(pm)-?(\d{2,})(-ep\d*|-\d*)?.*',
+    r'(mky-[a-z]{2,2})-?(\d{2,})(-ep\d*|-\d*)?.*',
+]
+
+# modou提取number
+def change_number(number):
+    number = number.lower().strip()
+    m = re.search(r'(md[a-z]{0,2})-?(\d{2,})(-ep\d*|-\d*)?.*', number, re.I)
+    if m:
+        return f'{m.group(1)}{m.group(2).zfill(4)}{m.group(3) or ""}'
+    for rules in NUM_RULES3:
+        m = re.search(rules, number, re.I)
+        if m:
+            return f'{m.group(1)}{m.group(2).zfill(3)}{m.group(3) or ""}'
+    return number
+
+
 class Madou(Parser):
     source = 'madou'
 
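change_number() takes over from the per-prefix helpers removed earlier in this diff: md* numbers are zero-padded to four digits, the NUM_RULES3 prefixes to three, and anything unmatched passes through lowercased and stripped. A few hypothetical inputs (assumes the repository and its dependencies are on PYTHONPATH):

```python
from scrapinglib.madou import change_number

print(change_number('MD-123'))      # -> md0123
print(change_number('MSD023-EP1'))  # -> msd023-ep1
print(change_number('YK-31'))       # -> yk031
print(change_number('ABC-123'))     # -> abc-123 (no rule matches, passed through)
```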
@@ -14,12 +36,15 @@ class Madou(Parser):
     expr_studio = '//a[@rel="category tag"]/text()'
     expr_tags = '/html/head/meta[@name="keywords"]/@content'
 
     def extraInit(self):
-        self.imagecut = 0
+        self.imagecut = 4
         self.uncensored = True
+        self.allow_number_change = True
 
     def search(self, number):
-        self.number = number.lower().strip()
+        self.number = change_number(number)
         if self.specifiedUrl:
             self.detailurl = self.specifiedUrl
         else:
@@ -65,5 +90,5 @@ class Madou(Parser):
 
     def getTags(self, htmltree):
         studio = self.getStudio(htmltree)
-        x = super().getTags(htmltree)
-        return [i.strip() for i in x if len(i.strip()) and studio not in i and '麻豆' not in i]
+        tags = super().getTags(htmltree)
+        return [tag for tag in tags if studio not in tag and '麻豆' not in tag]
@@ -210,6 +210,13 @@ class Parser:
 
     def getTags(self, htmltree) -> list:
         alls = self.getTreeAll(htmltree, self.expr_tags)
+        tags = []
+        for t in alls:
+            for tag in t.strip().split(','):
+                tag = tag.strip()
+                if tag:
+                    tags.append(tag)
+        return tags
         return [ x.strip() for x in alls if x.strip()]
 
     def getStudio(self, htmltree):
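The new loop splits each extracted string on commas before stripping, which matters for sources whose expr_tags points at a keywords meta tag (a single comma-separated string), such as the madou expression above. A small sketch with a hypothetical keywords value:

```python
# Hypothetical value of a <meta name="keywords"> content attribute.
alls = ['麻豆传媒, 中文字幕 , 国产']
tags = []
for t in alls:
    for tag in t.strip().split(','):
        tag = tag.strip()
        if tag:
            tags.append(tag)
print(tags)  # -> ['麻豆传媒', '中文字幕', '国产']
```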