@@ -351,6 +351,8 @@ def movie_lists(source_folder, regexstr: str) -> typing.List[str]:
     for full_name in source.glob(r'**/*'):
         if main_mode != 3 and set(full_name.parent.parts) & escape_folder_set:
             continue
+        if not full_name.is_file():
+            continue
         if not full_name.suffix.lower() in file_type:
             continue
         absf = str(full_name)
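Why the new guard matters: source.glob(r'**/*') yields directories as well as files, so a folder whose name happens to end in a video extension would previously slip past the suffix check. A minimal sketch of the behaviour, using a hypothetical layout:

from pathlib import Path

# Hypothetical layout: a *directory* named like a video file
# movies/
#   Trailer.mp4/        <- directory
#     part1.mp4         <- file
for p in Path('movies').glob(r'**/*'):
    if not p.is_file():      # the new guard: skip directories outright
        continue
    print(p)                 # only real files reach the suffix check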
@@ -681,7 +683,7 @@ def period(delta, pattern):
 
 
 if __name__ == '__main__':
-    version = '6.6.5'
+    version = '6.6.6'
     urllib3.disable_warnings()  # Ignore http proxy warning
     app_start = time.time()
 
@@ -129,7 +129,7 @@ mode = 1
 vars = outline,series,studio,tag,title
 
 [javdb]
-sites = 38,39,40
+sites = 521
 
 ; Face recognition: locations_model=hog: histogram of oriented gradients (less accurate, fast)  cnn: deep-learning model (accurate, needs GPU/CUDA, slow)
 ; uncensored_only=0: run face recognition on every cover  1: only recognize uncensored covers; censored covers are simply cropped to the right half
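The numbers under [javdb] appear to select numbered mirror domains (an assumption inferred from the option name; the scraper presumably builds URLs such as javdb521.com). A hedged sketch of how the setting would be consumed:

# Sketch only; the real expansion lives in the javdb parser, not this diff
sites = '521'   # value from config.ini, section [javdb]
mirrors = [f'https://javdb{n}.com' for n in sites.split(',')]
# -> ['https://javdb521.com']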
@@ -5,7 +5,7 @@ numpy
 face-recognition-models
 lxml
 beautifulsoup4
-pillow
+pillow==9.5.0
 cloudscraper
 pysocks==1.7.1
 urllib3==1.25.11
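The pin is most likely a guard against Pillow 10, which removed the long-deprecated Image.ANTIALIAS constant (an assumption; the diff itself gives no reason). Code still using the old constant fails on 10.x, while the LANCZOS name works on both old and new versions:

from PIL import Image

# 'cover.jpg' is a placeholder path for illustration.
img = Image.open('cover.jpg')
# Image.ANTIALIAS raises AttributeError on Pillow >= 10;
# Image.LANCZOS is the equivalent that works across versions.
img = img.resize((379, 538), Image.LANCZOS)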
@@ -220,9 +220,7 @@ class Scraping:
         elif "pcolle" in sources and "pcolle" in lo_file_number:
             sources = ["pcolle"]
         elif "fc2" in lo_file_number:
-            if "fc2" in sources:
-                sources = insert(sources, "msin")
-                sources = insert(sources, "fc2")
+            sources = ["fc2", "msin"]
         elif "mgstage" in sources and \
             (re.search(r"\d+\D+", file_number) or "siro" in lo_file_number):
             sources = insert(sources, "mgstage")
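For context, insert is presumably a small helper that moves an already-configured source to the front of the priority list (an assumption; the helper is not shown in this diff). The change above therefore stops merely re-prioritising and instead hard-codes the source list for FC2 numbers, whether or not "fc2" was configured. A sketch of the presumed helper:

def insert(sources, source):
    # Presumed behaviour (not shown in this diff): move an already-present
    # source to the head of the priority list; leave the list alone otherwise.
    if source in sources:
        sources.insert(0, sources.pop(sources.index(source)))
    return sources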
@@ -245,7 +243,7 @@ class Scraping:
                 todel.append(s)
         for d in todel:
             if config.getInstance().debug():
-                print('[!] Remove Source : ' + s)
+                print('[!] Remove Source : ' + d)
             sources.remove(d)
         return sources
 
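The one-character change above is a real bug fix: s is the loop variable of the preceding loop, so after that loop ends it still holds the last source inspected, and every debug line reported that name instead of the source actually being removed. A minimal illustration:

sources = ['javbus', 'javdb', 'fc2']
todel = []
for s in sources:
    if s != 'javdb':
        todel.append(s)          # s == 'fc2' once this loop finishes
for d in todel:
    # With `s` here the output would read "fc2" both times;
    # `d` names the source actually being removed.
    print('[!] Remove Source : ' + d)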
@@ -97,3 +97,8 @@ class Dlsite(Parser):
         except:
             result = ''
         return result
+
+    def getTags(self, htmltree):
+        tags = super().getTags(htmltree)
+        tags.append("DLsite")
+        return tags
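This is the same small pattern the Getchu classes adopt below: each site parser defers to the generic Parser.getTags and then appends its own source name, so every scraped record carries a tag identifying where it came from. A self-contained sketch of the pattern (names simplified from the diff):

class Parser:
    def getTags(self, htmltree):
        return ['sample-tag']        # stand-in for the real XPath extraction

class Dlsite(Parser):
    def getTags(self, htmltree):
        tags = super().getTags(htmltree)
        tags.append('DLsite')        # brand every record with its source
        return tags

print(Dlsite().getTags(None))        # ['sample-tag', 'DLsite']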
@@ -109,9 +109,15 @@ class wwwGetchu(Parser):
     def extradict(self, dic: dict):
         """ Additional headers to send
         """
         dic['headers'] = {'referer': self.detailurl}
         return dic
 
+    def getTags(self, htmltree):
+        tags = super().getTags(htmltree)
+        tags.append("Getchu")
+        return tags
+
+
 class dlGetchu(wwwGetchu):
     """ The two classes are basically the same;
     headers and extrafanart differ slightly
@@ -154,7 +160,7 @@ class dlGetchu(wwwGetchu):
 
     def extradict(self, dic: dict):
         return dic
 
     def getExtrafanart(self, htmltree):
         arts = self.getTreeAll(htmltree, self.expr_extrafanart)
         extrafanart = []
@@ -162,3 +168,8 @@ class dlGetchu(wwwGetchu):
             i = "https://dl.getchu.com" + i
             extrafanart.append(i)
         return extrafanart
+
+    def getTags(self, htmltree):
+        tags = super().getTags(htmltree)
+        tags.append("Getchu")
+        return tags
@@ -219,7 +219,6 @@ class Parser:
                 if tag:
                     tags.append(tag)
         return tags
-        return [ x.strip() for x in alls if x.strip()]
 
     def getStudio(self, htmltree):
         return self.getTreeElementbyExprs(htmltree, self.expr_studio, self.expr_studio2)
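The deleted line sat directly after an unconditional return, so it could never execute; removing it is pure dead-code cleanup with no behaviour change. For illustration:

def getTags_old(alls):
    tags = ['a', 'b']
    return tags
    return [x.strip() for x in alls if x.strip()]   # unreachable: never runs

print(getTags_old(['  a ']))   # ['a', 'b'] -- the second return is dead code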
scrapinglib/pcolle.py  (new file, 58 lines)
@@ -0,0 +1,58 @@
+# -*- coding: utf-8 -*-
+
+import re
+from lxml import etree
+from .httprequest import request_session
+from .parser import Parser
+
+
+class Pcolle(Parser):
+    source = 'pcolle'
+
+    expr_number = '//th[contains(text(),"商品ID")]/../td/text()'
+    expr_title = '//div[@class="title-04"]/div/text()'
+    expr_studio = '//th[contains(text(),"販売会員")]/../td/a/text()'
+    expr_director = '//th[contains(text(),"販売会員")]/../td/a/text()'
+    expr_actor = '//th[contains(text(),"販売会員")]/../td/a/text()'
+    expr_label = '//th[contains(text(),"カテゴリー")]/../td/ul/li/a/text()'
+    expr_series = '//th[contains(text(),"カテゴリー")]/../td/ul/li/a/text()'
+    expr_release = '//th[contains(text(),"販売開始日")]/../td/text()'
+    expr_cover = '/html/body/div[1]/div/div[4]/div[2]/div/div[1]/div/article/a/img/@src'
+    expr_tags = '//p[contains(text(),"商品タグ")]/../ul/li/a/text()'
+    expr_outline = '//p[@class="fo-14"]/text()'
+    expr_extrafanart = '//*[@class="item-nav"]/ul/li/a/img/@src'
+
+    # expr_extrafanart2 = '//*[@id="cart_quantity"]/table/tr[3]/td/div/a/img/@src'
+
+    def extraInit(self):
+        self.imagecut = 4
+
+    def search(self, number: str):
+        self.number = number.upper().replace('PCOLLE-', '')
+        self.detailurl = 'https://www.pcolle.com/product/detail/?product_id=' + self.number
+        session = request_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify)
+        htmlcode = session.get(self.detailurl).text
+        htmltree = etree.HTML(htmlcode)
+        result = self.dictformat(htmltree)
+        return result
+
+    def getNum(self, htmltree):
+        num = super().getNum(htmltree).upper()
+        if self.number != num:
+            raise Exception(f'[!] {self.number}: find [{num}] in pcolle, not match')
+        return "PCOLLE-" + str(num)
+
+    def getOutline(self, htmltree):
+        result = self.getTreeAll(htmltree, self.expr_outline)
+        try:
+            return "\n".join(result)
+        except:
+            return ""
+
+    def getRelease(self, htmltree):
+        return super().getRelease(htmltree).replace('年', '-').replace('月', '-').replace('日', '')
+
+    def getCover(self, htmltree):
+        if ".gif" in super().getCover(htmltree) and len(super().getExtrafanart(htmltree)) != 0:
+            return super().getExtrafanart(htmltree)[0]
+        return super().getCover(htmltree)
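A minimal driver for the new parser, shown only as a sketch: in the application itself the class is reached through the Scraping source dispatch above, and the cookies/proxies/verify attributes come from the shared config, so standalone use assumes the Parser base class supplies sane defaults for them. 'PCOLLE-123456' is a made-up id for illustration.

from scrapinglib.pcolle import Pcolle

parser = Pcolle()
# search() strips the PCOLLE- prefix, fetches the product page and
# returns the parsed metadata via dictformat().
data = parser.search('PCOLLE-123456')
print(data)

Note the getCover fallback at the end of the file: when the cover expression resolves to a .gif (apparently a placeholder image), the first extrafanart image is used as the cover instead.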