1. 动态加载爬虫

2. 修复pyinstaller路径查找子包问题
3. madou的番号处理移动到爬虫内部
4. 过滤javday中多余的tag
This commit is contained in:
hejianjun
2023-03-27 15:37:00 +08:00
parent 24e8b75dab
commit 1d46a70eed
9 changed files with 89 additions and 125 deletions

View File

@@ -57,7 +57,7 @@ image_naming_with_number = 0
update_check = 1
[priority]
website = javbus,airav,jav321,fanza,xcity,mgstage,fc2,avsox,dlsite,carib,madou,mv91,getchu,javdb,gcolle
website = javbus,airav,jav321,fanza,xcity,mgstage,fc2,avsox,dlsite,carib,madou,mv91,getchu,javdb,gcolle,javday,javmenu
[escape]
literals = \()/

View File

@@ -59,7 +59,7 @@ def get_number(debug: bool, file_path: str) -> str:
if not re.search("-|_", filename): # 去掉-CD1之后再无-的情况例如n1012-CD1.wmv
return str(re.search(r'\w+', filename[:filename.find('.')], re.A).group())
file_number = os.path.splitext(filename)
filename = re.search(r'\w+(-|_)\w+', filename, re.A)
filename = re.search(r'[\w\-_]+', filename, re.A)
if filename:
file_number = str(filename.group())
else:
@@ -85,34 +85,7 @@ def get_number(debug: bool, file_path: str) -> str:
print(f'[-]Number Parser exception: {e} [{file_path}]')
return None
# modou提取number
def md(filename):
m = re.search(r'(md[a-z]{0,2}-?)(\d{2,})(-ep\d*|-\d*)*', filename, re.I)
return f'{m.group(1).replace("-","").upper()}{m.group(2).zfill(4)}{m.group(3) or ""}'
def mmz(filename):
m = re.search(r'(mmz-?)(\d{2,})(-ep\d*)*', filename, re.I)
return f'{m.group(1).replace("-","").upper()}{m.group(2).zfill(3)}{m.group(3) or ""}'
def msd(filename):
m = re.search(r'(msd-?)(\d{2,})(-ep\d*)*', filename, re.I)
return f'{m.group(1).replace("-","").upper()}{m.group(2).zfill(3)}{m.group(3) or ""}'
def mky(filename):
m = re.search(r'(mky-[a-z]{2,2}-?)(\d{2,})(-ep\d*)*', filename, re.I)
return f'{m.group(1).replace("-","").upper()}{m.group(2).zfill(3)}{m.group(3) or ""}'
def yk(filename):
m = re.search(r'(yk-?)(\d{2,})(-ep\d*)*', filename, re.I)
return f'{m.group(1).replace("-","").upper()}{m.group(2).zfill(3)}{m.group(3) or ""}'
def pm(filename):
m = re.search(r'(pm[a-z]?-?)(\d{2,})(-ep\d*)*', filename, re.I)
return f'{m.group(1).replace("-","").upper()}{m.group(2).zfill(3)}{m.group(3) or ""}'
def fsog(filename):
m = re.search(r'(fsog-?)(\d{2,})(-ep\d*)*', filename, re.I)
return f'{m.group(1).replace("-","").upper()}{m.group(2).zfill(3)}{m.group(3) or ""}'
# 按javdb数据源的命名规范提取number
G_TAKE_NUM_RULES = {
@@ -126,13 +99,6 @@ G_TAKE_NUM_RULES = {
'heyzo': lambda x: 'HEYZO-' + re.findall(r'heyzo[^\d]*(\d{4})', x, re.I)[0],
'mdbk': lambda x: str(re.search(r'mdbk(-|_)(\d{4})', x, re.I).group()),
'mdtm': lambda x: str(re.search(r'mdtm(-|_)(\d{4})', x, re.I).group()),
r'\bmd[a-z]{0,2}-\d{2,}': md,
r'\bmmz-\d{2,}':mmz,
r'\bmsd-\d{2,}':msd,
r'\bmky-[a-z]{2,2}-\d{2,}':mky,
r'\byk-\d{2,3}': yk,
r'\bpm[a-z]?-?\d{2,}':pm,
r'\bfsog-?\d{2,}':fsog
}

View File

@@ -1,21 +1,21 @@
# If you can't run this script, please execute the following command in PowerShell.
# Set-ExecutionPolicy RemoteSigned -Scope CurrentUser -Force
$CLOUDSCRAPER_PATH=$(python -c 'import cloudscraper as _; print(_.__path__[0])' | select -Last 1)
$OPENCC_PATH=$(python -c 'import opencc as _; print(_.__path__[0])' | select -Last 1)
$FACE_RECOGNITION_MODELS=$(python -c 'import face_recognition_models as _; print(_.__path__[0])' | select -Last 1)
# bugfix: set submodules find path
$Env:PYTHONPATH=$pwd.path
$PYTHONPATH=$pwd.path
mkdir build
mkdir __pycache__
pyinstaller --onefile Movie_Data_Capture.py `
--hidden-import "ImageProcessing.cnn" `
--python-option u `
--add-data "$FACE_RECOGNITION_MODELS;face_recognition_models" `
--add-data "$CLOUDSCRAPER_PATH;cloudscraper" `
--add-data "$OPENCC_PATH;opencc" `
pyinstaller --collect-submodules "scrapinglib" `
--collect-submodules "ImageProcessing" `
--collect-data "face_recognition_models" `
--collect-data "cloudscraper" `
--collect-data "opencc" `
--add-data "Img;Img" `
--add-data "config.ini;." `
--onefile Movie_Data_Capture.py
rmdir -Recurse -Force build
rmdir -Recurse -Force __pycache__

View File

@@ -1,3 +1,2 @@
# -*- coding: utf-8 -*-
from .api import search, getSupportedSources

View File

@@ -2,30 +2,9 @@
import re
import json
from .parser import Parser
import config
from .airav import Airav
from .carib import Carib
from .dlsite import Dlsite
from .fanza import Fanza
from .gcolle import Gcolle
from .getchu import Getchu
from .jav321 import Jav321
from .javdb import Javdb
from .fc2 import Fc2
from .madou import Madou
from .mgstage import Mgstage
from .javbus import Javbus
from .xcity import Xcity
from .avsox import Avsox
from .javlibrary import Javlibrary
from .javday import Javday
from .pissplay import Pissplay
from .javmenu import Javmenu
from .tmdb import Tmdb
from .imdb import Imdb
import importlib
def search(number, sources: str = None, **kwargs):
""" 根据`番号/电影`名搜索信息
@@ -54,34 +33,10 @@ class Scraping:
"""
adult_full_sources = ['javlibrary', 'javdb', 'javbus', 'airav', 'fanza', 'xcity', 'jav321',
'mgstage', 'fc2', 'avsox', 'dlsite', 'carib', 'madou',
'getchu', 'gcolle','javday','pissplay','javmenu'
'getchu', 'gcolle', 'javday', 'pissplay', 'javmenu'
]
adult_func_mapping = {
'avsox': Avsox().scrape,
'javbus': Javbus().scrape,
'xcity': Xcity().scrape,
'mgstage': Mgstage().scrape,
'madou': Madou().scrape,
'fc2': Fc2().scrape,
'dlsite': Dlsite().scrape,
'jav321': Jav321().scrape,
'fanza': Fanza().scrape,
'airav': Airav().scrape,
'carib': Carib().scrape,
'gcolle': Gcolle().scrape,
'javdb': Javdb().scrape,
'getchu': Getchu().scrape,
'javlibrary': Javlibrary().scrape,
'javday': Javday().scrape,
'pissplay': Pissplay().scrape,
'javmenu': Javmenu().scrape
}
general_full_sources = ['tmdb', 'imdb']
general_func_mapping = {
'tmdb': Tmdb().scrape,
'imdb': Imdb().scrape,
}
debug = False
@@ -126,13 +81,16 @@ class Scraping:
if self.debug:
print('[+]select', source)
try:
data = self.general_func_mapping[source](name, self)
module = importlib.import_module('.'+source,'scrapinglib')
parser_type = getattr(module, source.capitalize())
parser:Parser = parser_type()
data = parser.scrape(name,self)
if data == 404:
continue
json_data = json.loads(data)
except Exception as e:
# print('[!] 出错啦')
# print(e)
print('[!] 出错啦')
print(e)
pass
# if any service return a valid return, break
if self.get_data_state(json_data):
@@ -162,13 +120,16 @@ class Scraping:
if self.debug:
print('[+]select', source)
try:
data = self.adult_func_mapping[source](number, self)
module = importlib.import_module('.'+source,'scrapinglib')
parser_type = getattr(module, source.capitalize())
parser:Parser = parser_type()
data = parser.scrape(number,self)
if data == 404:
continue
json_data = json.loads(data)
except Exception as e:
# print('[!] 出错啦')
# print(e)
print('[!] 出错啦')
print(e)
pass
# json_data = self.func_mapping[source](number, self)
# if any service return a valid return, break
@@ -216,7 +177,7 @@ class Scraping:
# check sources in func_mapping
todel = []
for s in sources:
if not s in self.general_func_mapping:
if not s in self.general_full_sources:
print('[!] Source Not Exist : ' + s)
todel.append(s)
for d in todel:
@@ -235,7 +196,7 @@ class Scraping:
sources.insert(0, sources.pop(sources.index(source)))
return sources
if len(sources) <= len(self.adult_func_mapping):
if len(sources) <= len(self.adult_full_sources):
# if the input file name matches certain rules,
# move some web service to the beginning of the list
lo_file_number = file_number.lower()
@@ -271,7 +232,7 @@ class Scraping:
# check sources in func_mapping
todel = []
for s in sources:
if not s in self.adult_func_mapping and config.getInstance().debug():
if not s in self.adult_full_sources and config.getInstance().debug():
print('[!] Source Not Exist : ' + s)
todel.append(s)
for d in todel:

View File

@@ -22,6 +22,7 @@ class Fc2(Parser):
def extraInit(self):
self.imagecut = 0
self.allow_number_change = True
def search(self, number):
self.number = number.lower().replace('fc2-ppv-', '').replace('fc2-', '')

View File

@@ -39,3 +39,8 @@ class Javday(Parser):
# 删除番号和网站名
result = title.replace(self.number,"").replace("- JAVDAY.TV","").strip()
return result
def getTags(self, htmltree) -> list:
tags = super().getTags(htmltree)
return [tag for tag in tags if 'JAVDAY.TV' not in tag]

View File

@@ -6,6 +6,28 @@ from urllib.parse import urlparse, unquote
from .parser import Parser
NUM_RULES3=[
r'(mmz{2,4})-?(\d{2,})(-ep\d*|-\d*)?.*',
r'(msd)-?(\d{2,})(-ep\d*|-\d*)?.*',
r'(yk)-?(\d{2,})(-ep\d*|-\d*)?.*',
r'(pm)-?(\d{2,})(-ep\d*|-\d*)?.*',
r'(mky-[a-z]{2,2})-?(\d{2,})(-ep\d*|-\d*)?.*',
]
# modou提取number
def change_number(number):
number = number.lower().strip()
m = re.search(r'(md[a-z]{0,2})-?(\d{2,})(-ep\d*|-\d*)?.*', number, re.I)
if m:
return f'{m.group(1)}{m.group(2).zfill(4)}{m.group(3) or ""}'
for rules in NUM_RULES3:
m = re.search(rules, number, re.I)
if m:
return f'{m.group(1)}{m.group(2).zfill(3)}{m.group(3) or ""}'
return number
class Madou(Parser):
source = 'madou'
@@ -14,12 +36,15 @@ class Madou(Parser):
expr_studio = '//a[@rel="category tag"]/text()'
expr_tags = '/html/head/meta[@name="keywords"]/@content'
def extraInit(self):
self.imagecut = 0
self.imagecut = 4
self.uncensored = True
self.allow_number_change = True
def search(self, number):
self.number = number.lower().strip()
self.number = change_number(number)
if self.specifiedUrl:
self.detailurl = self.specifiedUrl
else:
@@ -65,5 +90,5 @@ class Madou(Parser):
def getTags(self, htmltree):
studio = self.getStudio(htmltree)
x = super().getTags(htmltree)
return [i.strip() for i in x if len(i.strip()) and studio not in i and '麻豆' not in i]
tags = super().getTags(htmltree)
return [tag for tag in tags if studio not in tag and '麻豆' not in tag]

View File

@@ -210,6 +210,13 @@ class Parser:
def getTags(self, htmltree) -> list:
alls = self.getTreeAll(htmltree, self.expr_tags)
tags = []
for t in alls:
for tag in t.strip().split(','):
tag = tag.strip()
if tag:
tags.append(tag)
return tags
return [ x.strip() for x in alls if x.strip()]
def getStudio(self, htmltree):