Merge pull request #1 from yoshiko2/master

将GETCHU设置指定编码(euc-jp)
This commit is contained in:
Trance233
2023-07-09 17:04:19 +08:00
committed by GitHub
8 changed files with 83 additions and 10 deletions

View File

@@ -220,9 +220,7 @@ class Scraping:
elif "pcolle" in sources and "pcolle" in lo_file_number:
sources = ["pcolle"]
elif "fc2" in lo_file_number:
if "fc2" in sources:
sources = insert(sources, "msin")
sources = insert(sources, "fc2")
sources = ["fc2", "msin"]
elif "mgstage" in sources and \
(re.search(r"\d+\D+", file_number) or "siro" in lo_file_number):
sources = insert(sources, "mgstage")
@@ -245,7 +243,7 @@ class Scraping:
todel.append(s)
for d in todel:
if config.getInstance().debug():
print('[!] Remove Source : ' + s)
print('[!] Remove Source : ' + d)
sources.remove(d)
return sources

View File

@@ -97,3 +97,8 @@ class Dlsite(Parser):
except:
result = ''
return result
def getTags(self, htmltree):
    """Return the tags scraped by the base parser, with the source-site label appended."""
    result = super().getTags(htmltree)
    result.append("DLsite")
    return result

View File

@@ -109,9 +109,15 @@ class wwwGetchu(Parser):
def extradict(self, dic: dict) -> dict:
    """Add extra request headers.

    Sets a 'referer' header pointing at this parser's detail-page URL,
    then returns the (mutated) dict.
    """
    # NOTE(review): this assignment appears twice — it looks like a diff/paste
    # artifact (removed+added lines rendered identically); the second write is
    # a no-op re-assignment. Confirm against the upstream file.
    dic['headers'] = {'referer': self.detailurl}
    dic['headers'] = {'referer': self.detailurl}
    return dic
def getTags(self, htmltree):
    """Return the tags scraped by the base parser, with the source-site label appended."""
    result = super().getTags(htmltree)
    result.append("Getchu")
    return result
class dlGetchu(wwwGetchu):
""" 二者基本一致
headers extrafanart 略有区别
@@ -154,7 +160,7 @@ class dlGetchu(wwwGetchu):
def extradict(self, dic: dict):
    """Override: return *dic* unchanged — no extra headers are added for dl.getchu."""
    return dic
def getExtrafanart(self, htmltree):
arts = self.getTreeAll(htmltree, self.expr_extrafanart)
extrafanart = []
@@ -162,3 +168,8 @@ class dlGetchu(wwwGetchu):
i = "https://dl.getchu.com" + i
extrafanart.append(i)
return extrafanart
def getTags(self, htmltree):
    """Return the tags scraped by the base parser, with the source-site label appended."""
    result = super().getTags(htmltree)
    result.append("Getchu")
    return result

View File

@@ -219,7 +219,6 @@ class Parser:
if tag:
tags.append(tag)
return tags
return [ x.strip() for x in alls if x.strip()]
def getStudio(self, htmltree):
    """Extract the studio name using the two configured studio XPath expressions."""
    primary, fallback = self.expr_studio, self.expr_studio2
    return self.getTreeElementbyExprs(htmltree, primary, fallback)

58
scrapinglib/pcolle.py Normal file
View File

@@ -0,0 +1,58 @@
# -*- coding: utf-8 -*-
import re
from lxml import etree
from .httprequest import request_session
from .parser import Parser
class Pcolle(Parser):
    """Scraper for www.pcolle.com product-detail pages.

    Numbers are of the form "PCOLLE-<product_id>"; the prefix is stripped to
    build the detail URL and re-attached in the returned number.
    """

    source = 'pcolle'

    # XPath expressions against the product-detail page.
    expr_number = '//th[contains(text(),"商品ID")]/../td/text()'
    expr_title = '//div[@class="title-04"]/div/text()'
    expr_studio = '//th[contains(text(),"販売会員")]/../td/a/text()'
    expr_director = '//th[contains(text(),"販売会員")]/../td/a/text()'
    expr_actor = '//th[contains(text(),"販売会員")]/../td/a/text()'
    expr_label = '//th[contains(text(),"カテゴリー")]/../td/ul/li/a/text()'
    expr_series = '//th[contains(text(),"カテゴリー")]/../td/ul/li/a/text()'
    expr_release = '//th[contains(text(),"販売開始日")]/../td/text()'
    expr_cover = '/html/body/div[1]/div/div[4]/div[2]/div/div[1]/div/article/a/img/@src'
    expr_tags = '//p[contains(text(),"商品タグ")]/../ul/li/a/text()'
    expr_outline = '//p[@class="fo-14"]/text()'
    expr_extrafanart = '//*[@class="item-nav"]/ul/li/a/img/@src'
    # expr_extrafanart2 = '//*[@id="cart_quantity"]/table/tr[3]/td/div/a/img/@src'

    def extraInit(self):
        # Image-cut mode 4 — presumably a base-parser cropping mode; confirm in Parser.
        self.imagecut = 4

    def search(self, number: str):
        """Fetch the detail page for *number* and return the scraped data dict."""
        self.number = number.upper().replace('PCOLLE-', '')
        self.detailurl = 'https://www.pcolle.com/product/detail/?product_id=' + self.number
        session = request_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify)
        htmlcode = session.get(self.detailurl).text
        htmltree = etree.HTML(htmlcode)
        return self.dictformat(htmltree)

    def getNum(self, htmltree):
        """Return the canonical "PCOLLE-<id>" number, verifying the page matches the query."""
        num = super().getNum(htmltree).upper()
        if self.number != num:
            raise Exception(f'[!] {self.number}: find [{num}] in pcolle, not match')
        return "PCOLLE-" + str(num)

    def getOutline(self, htmltree):
        """Join all outline paragraphs with newlines; empty string on failure."""
        result = self.getTreeAll(htmltree, self.expr_outline)
        try:
            return "\n".join(result)
        except TypeError:
            # best-effort: non-iterable / non-string scrape result
            return ""

    def getRelease(self, htmltree):
        # Page dates are Japanese, e.g. "2023年07月09日"; normalise to "2023-07-09".
        # (Fixes a mis-encoded version that had lost the kanji literals, leaving
        # replace('', '-'), which inserts a dash between every character.)
        return super().getRelease(htmltree).replace('年', '-').replace('月', '-').replace('日', '')

    def getCover(self, htmltree):
        """Return the cover URL; if it is only a .gif placeholder, fall back to the first extrafanart."""
        cover = super().getCover(htmltree)
        if ".gif" in cover:
            fanart = super().getExtrafanart(htmltree)
            if fanart:
                return fanart[0]
        return cover