update scrapinglib

Mathhew
2022-05-27 15:24:29 +08:00
parent d6d0a1687b
commit 9898f2918f
16 changed files with 213 additions and 73 deletions

scrapinglib/airav.py

@@ -18,13 +18,11 @@ class Airav(Parser):
     expr_actor = '//ul[@class="videoAvstarList"]/li/a[starts-with(@href,"/idol/")]/text()'
     expr_cover = '//img[contains(@src,"/storage/big_pic/")]/@src'
 
-    def search(self, number, core: None):
+    def search(self, number):
         self.number = number
-        self.updateCore(core)
         self.detailurl = 'https://cn.airav.wiki/video/' + number
         engine = Javbus()
-        javbusinfo = engine.search(number, core)
+        javbusinfo = engine.search(number, self)
         if javbusinfo == 404:
             self.javbus = {"title": ""}
         else:

scrapinglib/api.py

@@ -8,6 +8,7 @@ from scrapinglib.carib import Carib
 from scrapinglib.dlsite import Dlsite
 from scrapinglib.fanza import Fanza
 from scrapinglib.gcolle import Gcolle
+from scrapinglib.getchu import Getchu
 from scrapinglib.jav321 import Jav321
 from scrapinglib.javdb import Javdb
 from scrapinglib.mv91 import Mv91
@@ -19,13 +20,15 @@ from .xcity import Xcity
 from .avsox import Avsox
 
-def search(number, souces=None, proxies=None, dbcookies=None):
+def search(number, souces=None, proxies=None, verify=None, dbcookies=None, dbsite=None, morestoryline=True):
     """
     TODO: support more sites: douban, imdb, tmdb, anidb, etc.
     use a type flag to distinguish r18 from normal content
     """
     sc = Scraping()
-    return sc.search(number, souces, proxies=proxies, dbcookies=dbcookies)
+    return sc.search(number, souces, proxies=proxies, verify=verify,
+                     dbcookies=dbcookies, dbsite=dbsite,
+                     morestoryline=morestoryline)
 
 class Scraping():
@@ -54,30 +57,39 @@ class Scraping():
     full_sources = ['avsox', 'javbus', 'xcity', 'mgstage', 'madou', 'fc2',
                     'dlsite', 'jav321', 'fanza', 'airav', 'carib', 'mv91',
-                    'gcolle', 'javdb']
+                    'gcolle', 'javdb', 'getchu']
 
     func_mapping = {
-        'avsox': Avsox().search,
-        'javbus': Javbus().search,
-        'xcity': Xcity().search,
-        'mgstage': Mgstage().search,
-        'madou': Madou().search,
-        'fc2': Fc2().search,
-        'dlsite': Dlsite().search,
-        'jav321': Jav321().search,
-        'fanza': Fanza().search,
-        'airav': Airav().search,
-        'carib': Carib().search,
-        'mv91': Mv91().search,
-        'gcolle': Gcolle().search,
-        'javdb': Javdb().search,
+        'avsox': Avsox().scrape,
+        'javbus': Javbus().scrape,
+        'xcity': Xcity().scrape,
+        'mgstage': Mgstage().scrape,
+        'madou': Madou().scrape,
+        'fc2': Fc2().scrape,
+        'dlsite': Dlsite().scrape,
+        'jav321': Jav321().scrape,
+        'fanza': Fanza().scrape,
+        'airav': Airav().scrape,
+        'carib': Carib().scrape,
+        'mv91': Mv91().scrape,
+        'gcolle': Gcolle().scrape,
+        'javdb': Javdb().scrape,
+        'getchu': Getchu().scrape,
     }
 
     proxies = None
+    verify = None
     dbcookies = None
+    dbsite = None
+    # use the storyline module to fetch a fuller plot synopsis
+    morestoryline = True
 
-    def search(self, number, sources=None, proxies=None, dbcookies=None):
+    def search(self, number, sources=None, proxies=None, verify=None,
+               dbcookies=None, dbsite=None, morestoryline=True):
         self.proxies = proxies
+        self.verify = verify
         self.dbcookies = dbcookies
+        self.dbsite = dbsite
+        self.morestoryline = morestoryline
         sources = self.checkSources(sources, number)
         json_data = {}
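
A minimal caller sketch against the widened signature above (the number, proxy address, and cookie values are placeholders, and the `from scrapinglib import api` path is an assumption about the package layout, not part of this commit):

from scrapinglib import api

metadata = api.search(
    'ABC-123',                                   # hypothetical movie number
    souces='javbus,javdb',                       # source priority, spelled as upstream spells it
    proxies={'https': 'http://127.0.0.1:7890'},  # forwarded to every parser
    verify=None,                                 # or a CA bundle path for TLS verification
    dbcookies={'over18': '1'},                   # consumed by Javdb.updateCore
    dbsite='javdb',                              # domain prefix for the javdb query URL
    morestoryline=True,                          # enable the storyline fallback in getOutline
)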

scrapinglib/avsox.py

@@ -57,8 +57,10 @@ class Avsox(Parser):
         return [i.strip() for i in tags[2:]] if len(tags) > 2 else []
 
     def getOutline(self, htmltree):
-        from .storyline import getStoryline
-        return getStoryline(self.number)
+        if self.morestoryline:
+            from .storyline import getStoryline
+            return getStoryline(self.number)
+        return ''
 
     def getActors(self, htmltree):
         a = super().getActors(htmltree)
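
The same morestoryline gate recurs in javbus, javdb, and xcity below. A self-contained sketch of the pattern, with a stub standing in for the upstream storyline module:

def getStoryline(number, uncensored=False):
    # stand-in for scrapinglib.storyline.getStoryline (sketch only)
    return 'plot text for ' + number

class ExampleParser:
    morestoryline = False   # flipped on via updateCore when the caller opts in
    number = 'ABC-123'      # hypothetical number

    def getOutline(self, htmltree=None):
        if self.morestoryline:
            # upstream defers the storyline import to this branch so the
            # module is only loaded when the feature is enabled
            return getStoryline(self.number)
        return ''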

scrapinglib/carib.py

@@ -20,10 +20,8 @@ class Carib(Parser):
     expr_series = "//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()"
     expr_outline = "//div[@class='movie-info section']/p[@itemprop='description']/text()"
 
-    def search(self, number, core: None):
+    def search(self, number):
         self.number = number
-        self.updateCore(core)
         self.detailurl = f'https://www.caribbeancom.com/moviepages/{number}/index.html'
         htmlcode = self.getHtml(self.detailurl)
         if htmlcode == 404 or 'class="movie-info section"' not in htmlcode:

scrapinglib/dlsite.py

@@ -8,6 +8,7 @@ from .parser import Parser
 class Dlsite(Parser):
     source = 'dlsite'
     imagecut = 4
+    allow_number_change = True
 
     expr_title = '/html/head/title/text()'
     expr_actor = '//th[contains(text(),"声优")]/../td/a/text()'
@@ -26,10 +27,8 @@ class Dlsite(Parser):
     expr_label2 = '//th[contains(text(),"社团名")]/../td/span[1]/a/text()'
     expr_extrafanart = '//*[@id="work_left"]/div/div/div[1]/div/@data-src'
 
-    def search(self, number, core: None):
-        self.updateCore(core)
+    def search(self, number):
         self.cookies = {'locale': 'zh-cn'}
         if "RJ" in number or "VJ" in number:
             self.number = number.upper()
             self.detailurl = 'https://www.dlsite.com/maniax/work/=/product_id/' + self.number + '.html/?locale=zh_CN'

scrapinglib/fanza.py

@@ -14,10 +14,8 @@ class Fanza(Parser):
     expr_outline2 = "//div[@class='mg-b20 lh4']//p/text()"
     expr_runtime = "//td[contains(text(),'収録時間')]/following-sibling::td/text()"
 
-    def search(self, number, core: None):
+    def search(self, number):
         self.number = number
-        self.updateCore(core)
         # fanza allows letters + numbers + underscore, normalize the input here
         # @note: the only underscore usage found so far is h_test123456789
         fanza_search_number = number

scrapinglib/fc2.py

@@ -20,10 +20,8 @@ class Fc2(Parser):
     expr_cover = "//div[@class='items_article_MainitemThumb']/span/img/@src"
     expr_tags = "//a[@class='tag tagTag']/text()"
 
-    def search(self, number, core: None):
+    def search(self, number):
         self.number = number.replace('FC2-', '').replace('fc2-', '')
-        self.updateCore(core)
         self.detailurl = 'https://adult.contents.fc2.com/article/' + self.number + '/'
         self.htmlcode = self.getHtml(self.detailurl)
         if self.htmlcode == 404:

scrapinglib/gcolle.py

@@ -25,10 +25,8 @@ class Gcolle(Parser):
     expr_extrafanart = '//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src'
     expr_extrafanart2 = '//*[@id="cart_quantity"]/table/tr[3]/td/div/a/img/@src'
 
-    def search(self, number, core: None):
+    def search(self, number):
         self.number = number.upper().replace('GCOLLE-','')
-        self.updateCore(core)
         self.detailurl = 'https://gcolle.net/product_info.php/products_id/' + self.number
         session = get_html_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify)
         htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + number).text

scrapinglib/getchu.py

@@ -1,8 +1,129 @@
 # -*- coding: utf-8 -*-
 
 import re
+import json
+from urllib.parse import quote
 from .parser import Parser
 
-class Getchu(Parser):
+class Getchu():
     source = 'getchu'
 
+    def scrape(self, number, core: None):
+        dl = dlGetchu()
+        www = wwwGetchu()
+        number = number.replace("-C", "")
+        dic = {}
+        if "item" in number:
+            sort = ["dl.scrape(number, core)", "www.scrape(number, core)"]
+        else:
+            sort = ["www.scrape(number, core)", "dl.scrape(number, core)"]
+        for i in sort:
+            try:
+                dic = eval(i)
+                if dic != None and json.loads(dic).get('title') != '':
+                    break
+            except:
+                pass
+        return dic
+
+
+class wwwGetchu(Parser):
+    imagecut = 0
+    allow_number_change = True
+    cookies = {'getchu_adalt_flag': 'getchu.com', "adult_check_flag": "1"}
+
+    GETCHU_WWW_SEARCH_URL = 'http://www.getchu.com/php/search.phtml?genre=anime_dvd&search_keyword=_WORD_&check_key_dtl=1&submit='
+
+    expr_title = '//*[@id="soft-title"]/text()'
+    expr_cover = "/html/body/div[1]/table[2]/tr[1]/td/a/@href"
+    expr_director = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"
+    expr_studio = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"
+    expr_actor = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"
+    expr_label = "//td[contains(text(),'ジャンル:')]/following-sibling::td/text()"
+    expr_release = "//td[contains(text(),'発売日:')]/following-sibling::td/a/text()"
+    expr_tags = "//td[contains(text(),'カテゴリ')]/following-sibling::td/a/text()"
+    expr_outline = "//div[contains(text(),'商品紹介')]/following-sibling::div/text()"
+    expr_extrafanart = "//div[contains(text(),'サンプル画像')]/following-sibling::div/a/@href"
+    expr_series = "//td[contains(text(),'ジャンル:')]/following-sibling::td/text()"
+
+    def queryNumberUrl(self, number):
+        self.number = quote(number, encoding="euc_jp")
+        queryUrl = self.GETCHU_WWW_SEARCH_URL.replace("_WORD_", self.number)
+        # NOTE: unclear why, but the query is attempted twice
+        retry = 2
+        for i in range(retry):
+            queryTree = self.getHtmlTree(queryUrl)
+            detailurl = self.getTreeIndex(queryTree, '//*[@id="detail_block"]/div/table/tr[1]/td/a[1]/@href')
+            if detailurl:
+                break
+        if detailurl == "":
+            return None
+        return detailurl.replace('../', 'http://www.getchu.com/')
+
+    def getNum(self, htmltree):
+        return 'GETCHU-' + re.findall('\d+', self.detailurl.replace("http://www.getchu.com/soft.phtml?id=", ""))[0]
+
+    def getCover(self, htmltree):
+        return "http://www.getchu.com" + super().getCover(htmltree).replace("./", '/')
+
+    def getActors(self, htmltree):
+        return super().getDirector(htmltree)
+
+    def getTags(self, htmltree):
+        return self.getAll(htmltree, self.expr_tags)
+
+    def getOutline(self, htmltree):
+        outline = ''
+        _list = self.getAll(htmltree, self.expr_outline)
+        for i in _list:
+            outline = outline + i.strip()
+        return outline
+
+    def getExtrafanart(self, htmltree):
+        arts = super().getExtrafanart(htmltree)
+        extrafanart = []
+        for i in arts:
+            i = "http://www.getchu.com" + i.replace("./", '/')
+            if 'jpg' in i:
+                extrafanart.append(i)
+        return extrafanart
+
+
+class dlGetchu(wwwGetchu):
+    imagecut = 4
+    allow_number_change = True
+    cookies = {"adult_check_flag": "1"}
+    extraheader = {"Referer": "https://dl.getchu.com/"}
+
+    GETCHU_DL_SEARCH_URL = 'https://dl.getchu.com/search/search_list.php?dojin=1&search_category_id=&search_keyword=_WORD_&btnWordSearch=%B8%A1%BA%F7&action=search&set_category_flag=1'
+    GETCHU_DL_URL = 'https://dl.getchu.com/i/item_WORD_'
+
+    expr_title = "//div[contains(@style,'color: #333333; padding: 3px 0px 0px 5px;')]/text()"
+    expr_cover = "//td[contains(@bgcolor,'#ffffff')]/img/@src"
+    expr_director = "//td[contains(text(),'作者')]/following-sibling::td/text()"
+    expr_studio = "//td[contains(text(),'サークル')]/following-sibling::td/a/text()"
+    expr_label = "//td[contains(text(),'サークル')]/following-sibling::td/a/text()"
+    expr_runtime = "//td[contains(text(),'画像数&ページ数')]/following-sibling::td/text()"
+    expr_release = "//td[contains(text(),'配信開始日')]/following-sibling::td/text()"
+    expr_tags = "//td[contains(text(),'趣向')]/following-sibling::td/a/text()"
+    expr_outline = "//*[contains(text(),'作品内容')]/following-sibling::td/text()"
+    expr_extrafanart = "//td[contains(@style,'background-color: #444444;')]/a/@href"
+    expr_series = "//td[contains(text(),'サークル')]/following-sibling::td/a/text()"
+
+    def queryNumberUrl(self, number):
+        if "item" in number or 'GETCHU' in number.upper():
+            self.number = re.findall('\d+', number)[0]
+        else:
+            queryUrl = self.GETCHU_DL_SEARCH_URL.replace("_WORD_", number)
+            queryTree = self.getHtmlTree(queryUrl)
+            detailurl = self.getTreeIndex(queryTree, '/html/body/div[1]/table/tr/td/table[4]/tr/td[2]/table/tr[2]/td/table/tr/td/table/tr/td[2]/div/a[1]/@href')
+            if detailurl == "":
+                return None
+            self.number = re.findall('\d+', detailurl)[0]
+        return self.GETCHU_DL_URL.replace("_WORD_", self.number)
+
+    def getNum(self, htmltree):
+        return 'GETCHU-' + re.findall('\d+', self.number)[0]
+
+    def getCover(self, htmltree):
+        return "https://dl.getchu.com" + super().getCover(htmltree)

scrapinglib/httprequest.py

@@ -9,7 +9,7 @@ from cloudscraper import create_scraper
 
 G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.133 Safari/537.36'
 G_DEFAULT_TIMEOUT = 10
 
-def get(url: str, cookies = None, ua: str = None, return_type: str = None, encoding: str = None,
+def get(url: str, cookies=None, ua: str = None, extra_headers=None, return_type: str = None, encoding: str = None,
         retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
     """
     Core HTTP request function
@@ -18,7 +18,8 @@ def get(url: str, cookies = None, ua: str = None, return_type: str = None, encoding: str = None,
     """
     errors = ""
     headers = {"User-Agent": ua or G_USER_AGENT}
+    if extra_headers != None:
+        headers.update(extra_headers)
     for i in range(retry):
         try:
             result = requests.get(url, headers=headers, timeout=timeout, proxies=proxies,
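
The new extra_headers dict is merged into the default header set before the retry loop, which is what lets dlGetchu attach its Referer via extraheader. A usage sketch (the item URL is a placeholder):

from scrapinglib import httprequest

html = httprequest.get(
    'https://dl.getchu.com/i/item12345',                  # hypothetical detail URL
    cookies={'adult_check_flag': '1'},
    extra_headers={'Referer': 'https://dl.getchu.com/'},  # merged into headers
)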

scrapinglib/javbus.py

@@ -29,10 +29,8 @@ class Javbus(Parser):
     expr_tags = '/html/head/meta[@name="keywords"]/@content'
     expr_uncensored = '//*[@id="navbar"]/ul[1]/li[@class="active"]/a[contains(@href,"uncensored")]'
 
-    def search(self, number, core: None):
+    def search(self, number):
         self.number = number
-        self.updateCore(core)
         try:
             url = "https://www." + secrets.choice([
                 'buscdn.fun', 'busdmm.fun', 'busfan.fun', 'busjav.fun',
@@ -139,7 +137,9 @@ class Javbus(Parser):
         return ''
 
     def getOutline(self, htmltree):
         if self.morestoryline:
+            if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'):
+                return ''  # calls from airav.py return immediately without computing the outline, avoiding duplicate scraping that would slow processing
             from .storyline import getStoryline
             return getStoryline(self.number, uncensored=self.uncensored)
         return ''
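
The airav short-circuit above works by walking the live interpreter stack and matching frame filenames: Airav fetches the storyline itself, so Javbus skips the duplicate work when airav.py appears anywhere in the call chain. The same idiom in isolation (a sketch; called_from is not an upstream helper):

import inspect
import os

def called_from(filename):
    # True if any frame in the current call stack originates from `filename`
    return any(os.path.basename(frame.filename) == filename
               for frame in inspect.stack())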

scrapinglib/javdb.py

@@ -43,15 +43,22 @@ class Javdb(Parser):
     def updateCore(self, core):
         if core.proxies:
             self.proxies = core.proxies
+        if core.verify:
+            self.verify = core.verify
+        if core.morestoryline:
+            self.morestoryline = True
         # special
         if core.dbcookies:
             self.cookies = core.dbcookies
         else:
             self.cookies = {'over18':'1', 'theme':'auto', 'locale':'zh'}
+        if core.dbsite:
+            self.dbsite = core.dbsite
+        else:
+            self.dbsite = 'javdb'
 
-    def search(self, number, core: None):
+    def search(self, number):
         self.number = number
-        self.updateCore(core)
         self.session = get_html_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify)
         self.detailurl = self.queryNumberUrl(number)
@@ -61,7 +68,7 @@ class Javdb(Parser):
         return result
 
     def queryNumberUrl(self, number):
-        javdb_url = 'https://javdb.com/search?q=' + number + '&f=all'
+        javdb_url = 'https://' + self.dbsite + '.com/search?q=' + number + '&f=all'
         try:
             resp = self.session.get(javdb_url)
         except Exception as e:
@@ -148,8 +155,10 @@ class Javdb(Parser):
         return r
 
     def getOutline(self, htmltree):
-        from .storyline import getStoryline
-        return getStoryline(self.number, self.getUncensored(htmltree))
+        if self.morestoryline:
+            from .storyline import getStoryline
+            return getStoryline(self.number, self.getUncensored(htmltree))
+        return ''
 
     def getStudio(self, htmltree):
         try:
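
With dbsite configurable, only the domain prefix of the query URL changes. A worked example of the substitution ('javdb39' is a hypothetical mirror name, not taken from this commit):

def build_query_url(dbsite, number):
    # mirrors the string concatenation in Javdb.queryNumberUrl
    return 'https://' + dbsite + '.com/search?q=' + number + '&f=all'

assert build_query_url('javdb', 'ABC-123') == 'https://javdb.com/search?q=ABC-123&f=all'
assert build_query_url('javdb39', 'ABC-123') == 'https://javdb39.com/search?q=ABC-123&f=all'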

scrapinglib/madou.py

@@ -15,10 +15,8 @@ class Madou(Parser):
     expr_studio = '//a[@rel="category tag"]/text()'
     expr_tags = '/html/head/meta[@name="keywords"]/@content'
 
-    def search(self, number, core: None):
+    def search(self, number):
         self.number = number.lower().strip()
-        self.updateCore(core)
         self.detailurl = "https://madou.club/" + number + ".html"
         self.htmlcode = self.getHtml(self.detailurl)
         if self.htmlcode == 404:

scrapinglib/mgstage.py

@@ -23,10 +23,8 @@ class Mgstage(Parser):
     expr_tags2 = '//th[contains(text(),"ジャンル:")]/../td/text()'
     expr_series = '//th[contains(text(),"シリーズ")]/../td/a/text()'
 
-    def search(self, number, core: None):
+    def search(self, number):
         self.number = number.upper()
-        self.updateCore(core)
         self.cookies = {'adc':'1'}
         self.detailurl = 'https://www.mgstage.com/product/product_detail/'+str(self.number)+'/'
         self.htmlcode = self.getHtml(self.detailurl)

scrapinglib/parser.py

@@ -11,10 +11,13 @@ class Parser:
     source = 'base'
     imagecut = 1
     uncensored = False
+    allow_number_change = False
     # update
     proxies = None
-    cookies = None
+    verify = None
+    extraheader = None
+    cookies = None
     morestoryline = False
 
     number = ''
     detailurl = ''
@@ -47,12 +50,15 @@ class Parser:
     def __init__(self) -> None:
         pass
 
-    def search(self, number, core: None):
-        """ Search by number
+    def scrape(self, number, core: None):
+        """ Scrape metadata for the given number
         """
         self.number = number
         self.updateCore(core)
+        result = self.search(number)
+        return result
+
+    def search(self, number):
+        self.number = number
         self.detailurl = self.queryNumberUrl(number)
         htmltree = self.getHtmlTree(self.detailurl)
         result = self.dictformat(htmltree)
@@ -66,6 +72,10 @@ class Parser:
         """
         if core.proxies:
             self.proxies = core.proxies
+        if core.verify:
+            self.verify = core.verify
+        if core.morestoryline:
+            self.morestoryline = True
 
     def queryNumberUrl(self, number):
         """ Look up the detail-page url for the given number
@@ -78,7 +88,7 @@ class Parser:
     def getHtml(self, url, type = None):
         """ Fetch a web page
         """
-        resp = httprequest.get(url, cookies=self.cookies, proxies=self.proxies, verify=self.verify, return_type=type)
+        resp = httprequest.get(url, cookies=self.cookies, proxies=self.proxies, extra_headers=self.extraheader, verify=self.verify, return_type=type)
         if '<title>404 Page Not Found' in resp \
             or '<title>未找到页面' in resp \
             or '404 Not Found' in resp \
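
The refactor splits the old search(number, core) into a public scrape(number, core) that absorbs caller settings and a core-free search(number) that subclasses keep overriding. A self-contained sketch of the new control flow (a simplified stand-in, not the upstream Parser):

class MiniParser:
    # simplified mirror of the refactored base class
    proxies = None
    verify = None
    morestoryline = False

    def scrape(self, number, core=None):
        # external entry point: absorb caller settings, then run the search
        if core is not None:
            self.updateCore(core)
        return self.search(number)

    def updateCore(self, core):
        if core.proxies:
            self.proxies = core.proxies
        if core.verify:
            self.verify = core.verify
        if core.morestoryline:
            self.morestoryline = True

    def search(self, number):
        # site-specific subclasses override the steps inside this pipeline
        return {'number': number, 'proxies': self.proxies}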

scrapinglib/xcity.py

@@ -59,8 +59,10 @@ class Xcity(Parser):
         return ''
 
     def getOutline(self, htmltree):
-        from .storyline import getStoryline
-        return getStoryline(self.number, uncensored=False)
+        if self.morestoryline:
+            from .storyline import getStoryline
+            return getStoryline(self.number, uncensored=False)
+        return ''
 
     def getActors(self, htmltree):
         htmla = self.browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a')
@@ -111,10 +113,8 @@ class Xcity(Parser):
             raise ValueError("xcity.py: detail page not found")
         return str(browser.page), browser
 
-    def search(self, number, core: None):
+    def search(self, number):
         self.number = number
-        self.updateCore(core)
         self.detail_page, self.browser = self.open_by_browser(number)
         self.detailurl = self.browser.url
         lx = etree.fromstring(self.detail_page, etree.HTMLParser())