Merge pull request #993 from hejianjun/feature/自定义爬虫番号处理
Feature/自定义爬虫番号处理
This commit is contained in:
@@ -57,7 +57,7 @@ image_naming_with_number = 0
|
||||
update_check = 1
|
||||
|
||||
[priority]
|
||||
website = javbus,airav,jav321,fanza,xcity,mgstage,fc2,avsox,dlsite,carib,madou,mv91,getchu,javdb,gcolle
|
||||
website = javbus,airav,jav321,fanza,xcity,mgstage,fc2,avsox,dlsite,carib,madou,mv91,getchu,javdb,gcolle,javday,javmenu
|
||||
|
||||
[escape]
|
||||
literals = \()/
|
||||
|
||||
@@ -59,7 +59,7 @@ def get_number(debug: bool, file_path: str) -> str:
|
||||
if not re.search("-|_", filename): # 去掉-CD1之后再无-的情况,例如n1012-CD1.wmv
|
||||
return str(re.search(r'\w+', filename[:filename.find('.')], re.A).group())
|
||||
file_number = os.path.splitext(filename)
|
||||
filename = re.search(r'\w+(-|_)\w+', filename, re.A)
|
||||
filename = re.search(r'[\w\-_]+', filename, re.A)
|
||||
if filename:
|
||||
file_number = str(filename.group())
|
||||
else:
|
||||
@@ -85,34 +85,7 @@ def get_number(debug: bool, file_path: str) -> str:
|
||||
print(f'[-]Number Parser exception: {e} [{file_path}]')
|
||||
return None
|
||||
|
||||
# Number extraction for Madou-family studio prefixes.
def _normalize_number(filename, pattern, width):
    """Extract a studio number from *filename* and return its canonical form.

    ``pattern`` is searched case-insensitively and must define three groups:
    the prefix (which may contain dashes), the digits, and an optional
    suffix.  Dashes are removed from the prefix and it is upper-cased, the
    digits are left-padded with zeros to ``width`` characters, and the suffix
    is appended unchanged when it took part in the match.

    Raises:
        AttributeError: if ``pattern`` does not match ``filename`` (the
            match object is ``None``), exactly as the per-prefix functions
            behaved before they shared this helper.
    """
    m = re.search(pattern, filename, re.I)
    prefix = m.group(1).replace("-", "").upper()
    digits = m.group(2).zfill(width)
    suffix = m.group(3) or ""
    return f'{prefix}{digits}{suffix}'


def md(filename):
    """Normalize an ``md``/``mdX``/``mdXX`` number; digits padded to 4."""
    return _normalize_number(filename, r'(md[a-z]{0,2}-?)(\d{2,})(-ep\d*|-\d*)*', 4)


def mmz(filename):
    """Normalize an ``mmz`` number; digits padded to 3."""
    return _normalize_number(filename, r'(mmz-?)(\d{2,})(-ep\d*)*', 3)


def msd(filename):
    """Normalize an ``msd`` number; digits padded to 3."""
    return _normalize_number(filename, r'(msd-?)(\d{2,})(-ep\d*)*', 3)


def mky(filename):
    """Normalize an ``mky-XX`` number; digits padded to 3."""
    return _normalize_number(filename, r'(mky-[a-z]{2,2}-?)(\d{2,})(-ep\d*)*', 3)


def yk(filename):
    """Normalize a ``yk`` number; digits padded to 3."""
    return _normalize_number(filename, r'(yk-?)(\d{2,})(-ep\d*)*', 3)


def pm(filename):
    """Normalize a ``pm``/``pmX`` number; digits padded to 3."""
    return _normalize_number(filename, r'(pm[a-z]?-?)(\d{2,})(-ep\d*)*', 3)


def fsog(filename):
    """Normalize an ``fsog`` number; digits padded to 3."""
    return _normalize_number(filename, r'(fsog-?)(\d{2,})(-ep\d*)*', 3)
|
||||
|
||||
# 按javdb数据源的命名规范提取number
|
||||
G_TAKE_NUM_RULES = {
|
||||
@@ -126,13 +99,6 @@ G_TAKE_NUM_RULES = {
|
||||
'heyzo': lambda x: 'HEYZO-' + re.findall(r'heyzo[^\d]*(\d{4})', x, re.I)[0],
|
||||
'mdbk': lambda x: str(re.search(r'mdbk(-|_)(\d{4})', x, re.I).group()),
|
||||
'mdtm': lambda x: str(re.search(r'mdtm(-|_)(\d{4})', x, re.I).group()),
|
||||
r'\bmd[a-z]{0,2}-\d{2,}': md,
|
||||
r'\bmmz-\d{2,}':mmz,
|
||||
r'\bmsd-\d{2,}':msd,
|
||||
r'\bmky-[a-z]{2,2}-\d{2,}':mky,
|
||||
r'\byk-\d{2,3}': yk,
|
||||
r'\bpm[a-z]?-?\d{2,}':pm,
|
||||
r'\bfsog-?\d{2,}':fsog
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -1,25 +1,25 @@
|
||||
# If you can't run this script, please execute the following command in PowerShell.
# Set-ExecutionPolicy RemoteSigned -Scope CurrentUser -Force

# Ask Python where each package is installed; only the last output line is kept.
$CLOUDSCRAPER_PATH = $(python -c 'import cloudscraper as _; print(_.__path__[0])' | Select-Object -Last 1)
$OPENCC_PATH = $(python -c 'import opencc as _; print(_.__path__[0])' | Select-Object -Last 1)
$FACE_RECOGNITION_MODELS = $(python -c 'import face_recognition_models as _; print(_.__path__[0])' | Select-Object -Last 1)

# Scratch directories, removed again once the build is done.
mkdir build
mkdir __pycache__

# Build a single-file executable, bundling the package directories located above.
pyinstaller --onefile Movie_Data_Capture.py `
    --hidden-import "ImageProcessing.cnn" `
    --python-option u `
    --add-data "$FACE_RECOGNITION_MODELS;face_recognition_models" `
    --add-data "$CLOUDSCRAPER_PATH;cloudscraper" `
    --add-data "$OPENCC_PATH;opencc" `
    --add-data "Img;Img" `
    --add-data "config.ini;."

# Clean up build by-products.
Remove-Item -Recurse -Force build
Remove-Item -Recurse -Force __pycache__
Remove-Item -Recurse -Force Movie_Data_Capture.spec

Write-Output "[Make]Finish"
pause
|
||||
# If you can't run this script, please execute the following command in PowerShell.
# Set-ExecutionPolicy RemoteSigned -Scope CurrentUser -Force

# bugfix: set submodules find path
# Only the environment variable is inherited by the pyinstaller process; the
# former script-local $PYTHONPATH assignment was never read and has been dropped.
$Env:PYTHONPATH=$pwd.path

# Scratch directories, removed again once the build is done.
mkdir build
mkdir __pycache__

# Build a single-file executable. Submodules and package data are collected by
# package name instead of by resolved installation path.
pyinstaller --collect-submodules "scrapinglib" `
    --collect-submodules "ImageProcessing" `
    --collect-data "face_recognition_models" `
    --collect-data "cloudscraper" `
    --collect-data "opencc" `
    --add-data "Img;Img" `
    --add-data "config.ini;." `
    --onefile Movie_Data_Capture.py

# Clean up build by-products.
rmdir -Recurse -Force build
rmdir -Recurse -Force __pycache__
rmdir -Recurse -Force Movie_Data_Capture.spec

echo "[Make]Finish"
pause
|
||||
|
||||
@@ -1,3 +1,2 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from .api import search, getSupportedSources
|
||||
from .api import search, getSupportedSources
|
||||
@@ -2,30 +2,9 @@
|
||||
|
||||
import re
|
||||
import json
|
||||
|
||||
from .parser import Parser
|
||||
import config
|
||||
from .airav import Airav
|
||||
from .carib import Carib
|
||||
from .dlsite import Dlsite
|
||||
from .fanza import Fanza
|
||||
from .gcolle import Gcolle
|
||||
from .getchu import Getchu
|
||||
from .jav321 import Jav321
|
||||
from .javdb import Javdb
|
||||
from .fc2 import Fc2
|
||||
from .madou import Madou
|
||||
from .mgstage import Mgstage
|
||||
from .javbus import Javbus
|
||||
from .xcity import Xcity
|
||||
from .avsox import Avsox
|
||||
from .javlibrary import Javlibrary
|
||||
from .javday import Javday
|
||||
from .pissplay import Pissplay
|
||||
from .javmenu import Javmenu
|
||||
|
||||
from .tmdb import Tmdb
|
||||
from .imdb import Imdb
|
||||
|
||||
import importlib
|
||||
|
||||
def search(number, sources: str = None, **kwargs):
|
||||
""" 根据`番号/电影`名搜索信息
|
||||
@@ -53,35 +32,11 @@ class Scraping:
|
||||
"""
|
||||
"""
|
||||
adult_full_sources = ['javlibrary', 'javdb', 'javbus', 'airav', 'fanza', 'xcity', 'jav321',
|
||||
'mgstage', 'fc2', 'avsox', 'dlsite', 'carib', 'madou',
|
||||
'getchu', 'gcolle','javday','pissplay','javmenu'
|
||||
'mgstage', 'fc2', 'avsox', 'dlsite', 'carib', 'madou',
|
||||
'getchu', 'gcolle', 'javday', 'pissplay', 'javmenu'
|
||||
]
|
||||
adult_func_mapping = {
|
||||
'avsox': Avsox().scrape,
|
||||
'javbus': Javbus().scrape,
|
||||
'xcity': Xcity().scrape,
|
||||
'mgstage': Mgstage().scrape,
|
||||
'madou': Madou().scrape,
|
||||
'fc2': Fc2().scrape,
|
||||
'dlsite': Dlsite().scrape,
|
||||
'jav321': Jav321().scrape,
|
||||
'fanza': Fanza().scrape,
|
||||
'airav': Airav().scrape,
|
||||
'carib': Carib().scrape,
|
||||
'gcolle': Gcolle().scrape,
|
||||
'javdb': Javdb().scrape,
|
||||
'getchu': Getchu().scrape,
|
||||
'javlibrary': Javlibrary().scrape,
|
||||
'javday': Javday().scrape,
|
||||
'pissplay': Pissplay().scrape,
|
||||
'javmenu': Javmenu().scrape
|
||||
}
|
||||
|
||||
general_full_sources = ['tmdb', 'imdb']
|
||||
general_func_mapping = {
|
||||
'tmdb': Tmdb().scrape,
|
||||
'imdb': Imdb().scrape,
|
||||
}
|
||||
|
||||
debug = False
|
||||
|
||||
@@ -126,13 +81,16 @@ class Scraping:
|
||||
if self.debug:
|
||||
print('[+]select', source)
|
||||
try:
|
||||
data = self.general_func_mapping[source](name, self)
|
||||
module = importlib.import_module('.'+source,'scrapinglib')
|
||||
parser_type = getattr(module, source.capitalize())
|
||||
parser:Parser = parser_type()
|
||||
data = parser.scrape(name,self)
|
||||
if data == 404:
|
||||
continue
|
||||
json_data = json.loads(data)
|
||||
except Exception as e:
|
||||
# print('[!] 出错啦')
|
||||
# print(e)
|
||||
print('[!] 出错啦')
|
||||
print(e)
|
||||
pass
|
||||
# if any service return a valid return, break
|
||||
if self.get_data_state(json_data):
|
||||
@@ -162,13 +120,16 @@ class Scraping:
|
||||
if self.debug:
|
||||
print('[+]select', source)
|
||||
try:
|
||||
data = self.adult_func_mapping[source](number, self)
|
||||
module = importlib.import_module('.'+source,'scrapinglib')
|
||||
parser_type = getattr(module, source.capitalize())
|
||||
parser:Parser = parser_type()
|
||||
data = parser.scrape(number,self)
|
||||
if data == 404:
|
||||
continue
|
||||
json_data = json.loads(data)
|
||||
except Exception as e:
|
||||
# print('[!] 出错啦')
|
||||
# print(e)
|
||||
print('[!] 出错啦')
|
||||
print(e)
|
||||
pass
|
||||
# json_data = self.func_mapping[source](number, self)
|
||||
# if any service return a valid return, break
|
||||
@@ -216,7 +177,7 @@ class Scraping:
|
||||
# check sources in func_mapping
|
||||
todel = []
|
||||
for s in sources:
|
||||
if not s in self.general_func_mapping:
|
||||
if not s in self.general_full_sources:
|
||||
print('[!] Source Not Exist : ' + s)
|
||||
todel.append(s)
|
||||
for d in todel:
|
||||
@@ -235,7 +196,7 @@ class Scraping:
|
||||
sources.insert(0, sources.pop(sources.index(source)))
|
||||
return sources
|
||||
|
||||
if len(sources) <= len(self.adult_func_mapping):
|
||||
if len(sources) <= len(self.adult_full_sources):
|
||||
# if the input file name matches certain rules,
|
||||
# move some web service to the beginning of the list
|
||||
lo_file_number = file_number.lower()
|
||||
@@ -271,7 +232,7 @@ class Scraping:
|
||||
# check sources in func_mapping
|
||||
todel = []
|
||||
for s in sources:
|
||||
if not s in self.adult_func_mapping and config.getInstance().debug():
|
||||
if not s in self.adult_full_sources and config.getInstance().debug():
|
||||
print('[!] Source Not Exist : ' + s)
|
||||
todel.append(s)
|
||||
for d in todel:
|
||||
|
||||
@@ -22,6 +22,7 @@ class Fc2(Parser):
|
||||
|
||||
def extraInit(self):
    """Apply this parser's fixed option values.

    Sets ``allow_number_change`` to ``True`` and ``imagecut`` to ``0``.
    NOTE(review): what these flags control is defined by the base parser,
    which is not visible here — confirm there.
    """
    self.allow_number_change = True
    self.imagecut = 0
|
||||
|
||||
def search(self, number):
|
||||
self.number = number.lower().replace('fc2-ppv-', '').replace('fc2-', '')
|
||||
|
||||
@@ -39,3 +39,8 @@ class Javday(Parser):
|
||||
# 删除番号和网站名
|
||||
result = title.replace(self.number,"").replace("- JAVDAY.TV","").strip()
|
||||
return result
|
||||
|
||||
def getTags(self, htmltree) -> list:
    """Return the inherited tag list without entries containing 'JAVDAY.TV'."""
    inherited = super().getTags(htmltree)
    kept = []
    for tag in inherited:
        # Drop any tag that carries the site's own name.
        if 'JAVDAY.TV' in tag:
            continue
        kept.append(tag)
    return kept
|
||||
|
||||
@@ -6,6 +6,28 @@ from urllib.parse import urlparse, unquote
|
||||
from .parser import Parser
|
||||
|
||||
|
||||
# Prefix rules whose numeric part is zero-padded to three digits.
# Each pattern has three groups: prefix, digits, optional "-epN" / "-N" suffix.
NUM_RULES3 = [
    # Was '(mmz{2,4})': the quantifier applied only to the final 'z', so the
    # rule required "mmzz".."mmzzzz" and never matched the real "mmz" prefix.
    r'(mmz)-?(\d{2,})(-ep\d*|-\d*)?.*',
    r'(msd)-?(\d{2,})(-ep\d*|-\d*)?.*',
    r'(yk)-?(\d{2,})(-ep\d*|-\d*)?.*',
    r'(pm)-?(\d{2,})(-ep\d*|-\d*)?.*',
    r'(mky-[a-z]{2,2})-?(\d{2,})(-ep\d*|-\d*)?.*',
]


# Madou number extraction
def change_number(number):
    """Rewrite *number* into the compact, zero-padded form used for lookup.

    The input is lower-cased and stripped first.  If it contains an
    ``md``/``mdX``/``mdXX`` number, the result is prefix + digits padded to
    four characters + optional suffix.  Otherwise the ``NUM_RULES3`` patterns
    are tried in order and the first match is returned with digits padded to
    three characters.  The dash between prefix and digits is dropped in both
    cases.  When nothing matches, the lower-cased, stripped input is returned
    unchanged.
    """
    number = number.lower().strip()
    m = re.search(r'(md[a-z]{0,2})-?(\d{2,})(-ep\d*|-\d*)?.*', number, re.I)
    if m:
        return f'{m.group(1)}{m.group(2).zfill(4)}{m.group(3) or ""}'
    for rule in NUM_RULES3:
        m = re.search(rule, number, re.I)
        if m:
            return f'{m.group(1)}{m.group(2).zfill(3)}{m.group(3) or ""}'
    return number
|
||||
|
||||
|
||||
|
||||
class Madou(Parser):
|
||||
source = 'madou'
|
||||
|
||||
@@ -14,12 +36,15 @@ class Madou(Parser):
|
||||
expr_studio = '//a[@rel="category tag"]/text()'
|
||||
expr_tags = '/html/head/meta[@name="keywords"]/@content'
|
||||
|
||||
|
||||
|
||||
def extraInit(self):
    """Apply this parser's fixed option values.

    Sets ``imagecut`` to ``4`` and both ``uncensored`` and
    ``allow_number_change`` to ``True``.  The earlier ``imagecut = 0``
    assignment was overwritten on the very next statement and has been
    removed.
    NOTE(review): what these flags control is defined by the base parser,
    which is not visible here — confirm there.
    """
    self.imagecut = 4
    self.uncensored = True
    self.allow_number_change = True
|
||||
|
||||
def search(self, number):
|
||||
self.number = number.lower().strip()
|
||||
self.number = change_number(number)
|
||||
if self.specifiedUrl:
|
||||
self.detailurl = self.specifiedUrl
|
||||
else:
|
||||
@@ -65,5 +90,5 @@ class Madou(Parser):
|
||||
|
||||
def getTags(self, htmltree):
    """Return the inherited tags minus blanks, the studio name and '麻豆'.

    Each inherited tag is stripped.  A tag is dropped when it is empty after
    stripping, when it contains the studio string returned by
    ``getStudio``, or when it contains '麻豆'.  The statements that followed
    the first ``return`` could never run and have been removed.
    """
    studio = self.getStudio(htmltree)
    tags = super().getTags(htmltree)
    return [tag.strip() for tag in tags
            if tag.strip() and studio not in tag and '麻豆' not in tag]
|
||||
|
||||
@@ -210,6 +210,13 @@ class Parser:
|
||||
|
||||
def getTags(self, htmltree) -> list:
    """Collect tags selected by ``self.expr_tags`` from *htmltree*.

    Every string returned by ``getTreeAll`` is split on commas; each piece is
    stripped and empty pieces are discarded.  Order is preserved.  The second
    ``return`` that followed ``return tags`` was unreachable and has been
    removed.
    """
    tags = []
    for raw in self.getTreeAll(htmltree, self.expr_tags):
        # One node may hold several comma-separated tags.
        for piece in raw.split(','):
            piece = piece.strip()
            if piece:
                tags.append(piece)
    return tags
|
||||
|
||||
def getStudio(self, htmltree):
|
||||
|
||||
Reference in New Issue
Block a user