add scrapinglib

This commit is contained in:
Mathhew
2022-05-26 14:03:58 +08:00
parent 529aeaddd2
commit b7ecb66210
20 changed files with 2515 additions and 0 deletions

3
scrapinglib/__init__.py Normal file

@@ -0,0 +1,3 @@
# -*- coding: utf-8 -*-
from .api import search
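The package's public surface is just this re-export. A minimal usage sketch of the search entry point defined in api.py below — the number, source list and proxy address are made-up placeholders; the call returns the metadata dict from the first source that yields a usable title and number, or None:

# hypothetical usage sketch, not part of the commit
from scrapinglib import search

# query all supported sources in their default priority order
metadata = search('ABP-123')

# or restrict the lookup to specific sources and route it through a proxy
metadata = search('ABP-123', 'javbus,javdb',
                  proxies={'https': 'http://127.0.0.1:7890'})
if metadata:
    print(metadata['title'], metadata['number'])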

133
scrapinglib/airav.py Normal file

@@ -0,0 +1,133 @@
# -*- coding: utf-8 -*-
import json
import re
from lxml import etree
from bs4 import BeautifulSoup
from .parser import Parser
from .javbus import Javbus
class Airav(Parser):
source = 'airav'
expr_title = '/html/head/title/text()'
expr_number = '/html/head/title/text()'
expr_studio = '//a[contains(@href,"?video_factory=")]/text()'
expr_release = '//li[contains(text(),"發片日期")]/text()'
expr_outline = "string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)"
expr_actor = '//ul[@class="videoAvstarList"]/li/a[starts-with(@href,"/idol/")]/text()'
expr_cover = '//img[contains(@src,"/storage/big_pic/")]/@src'
def search(self, number, core: None):
self.number = number
self.updateCore(core)
self.detailurl = 'https://cn.airav.wiki/video/' + number
engine = Javbus()
javbusinfo = engine.search(number, core)
if javbusinfo == 404:
self.javbus = {"title": ""}
else:
self.javbus = json.loads(javbusinfo)
self.htmlcode = self.getHtml(self.detailurl)
htmltree = etree.fromstring(self.htmlcode, etree.HTMLParser())
result = self.dictformat(htmltree)
return result
def getNum(self, htmltree):
# return super().getNum(htmltree)
result = self.javbus.get('number')
if isinstance(result, str) and len(result):
return result
number = super().getNum(htmltree)
result = str(re.findall('^\[(.*?)]', number)[0])
return result
def getTitle(self, htmltree):
title = super().getTitle(htmltree)
result = str(re.findall('](.*?)- AIRAV-WIKI', title)[0]).strip()
return result
def getStudio(self, htmltree):
result = self.javbus.get('studio')
if isinstance(result, str) and len(result):
return result
return super().getStudio(htmltree)
def getRelease(self, htmltree):
result = self.javbus.get('release')
if isinstance(result, str) and len(result):
return result
try:
return re.search(r'\d{4}-\d{2}-\d{2}', str(super().getRelease(htmltree))).group()
except:
return ''
def getYear(self, htmltree):
result = self.javbus.get('year')
if isinstance(result, str) and len(result):
return result
release = self.getRelease(htmltree)
return str(re.findall('\d{4}', release)).strip(" ['']")
def getOutline(self, htmltree):
return self.getAll(htmltree, self.expr_outline).replace('\n','').strip()
def getRuntime(self, htmltree):
result = self.javbus.get('runtime')
if isinstance(result, str) and len(result):
return result
return ''
def getDirector(self, htmltree):
result = self.javbus.get('director')
if isinstance(result, str) and len(result):
return result
return ''
def getActors(self, htmltree):
b=[]
a = super().getActors(htmltree)
for v in a:
v = v.strip()
if len(v):
b.append(v)
if len(b):
return b
result = self.javbus.get('actor')
if isinstance(result, list) and len(result):
return result
return []
def getCover(self, htmltree):
result = self.javbus.get('cover')
if isinstance(result, str) and len(result):
return result
return super().getCover(htmltree)
def getExtrafanart(self, htmltree):
html_pather = re.compile(r'<div class=\"mobileImgThumbnail\">[\s\S]*?</div></div></div></div>')
html = html_pather.search(self.htmlcode)
if html:
html = html.group()
extrafanart_pather = re.compile(r'<img.*?src=\"(.*?)\"')
extrafanart_imgs = extrafanart_pather.findall(html)
if extrafanart_imgs:
return extrafanart_imgs
return ''
def getTags(self, htmltree):
tag = []
soup = BeautifulSoup(self.htmlcode, 'lxml')
x = soup.find_all(attrs={'class': 'tagBtnMargin'})
a = x[0].find_all('a')
for i in a:
tag.append(i.get_text())
return tag
def getSeries(self, htmltree):
result = self.javbus.get('series')
if isinstance(result, str) and len(result):
return result
return ''

173
scrapinglib/api.py Normal file

@@ -0,0 +1,173 @@
# -*- coding: utf-8 -*-
import re
import json
from scrapinglib.airav import Airav
from scrapinglib.carib import Carib
from scrapinglib.dlsite import Dlsite
from scrapinglib.fanza import Fanza
from scrapinglib.gcolle import Gcolle
from scrapinglib.jav321 import Jav321
from scrapinglib.javdb import Javdb
from scrapinglib.mv91 import Mv91
from .fc2 import Fc2
from .madou import Madou
from .mgstage import Mgstage
from .javbus import Javbus
from .xcity import Xcity
from .avsox import Avsox
def search(number, sources=None, proxies=None, dbcookies=None):
"""
    TODO: support more sites (douban, imdb, tmdb, anidb, etc.)
    and use a type field to distinguish r18 from normal content.
"""
sc = Scraping()
    return sc.search(number, sources, proxies=proxies, dbcookies=dbcookies)
class Scraping():
"""
    Fetches metadata as-is, without modification.
    If translation or similar post-processing is needed, wrap this class in another layer.
    Naming-rule handling is likewise left to the wrapper layer, to keep the internals simple.
    The scraper sources to use can be specified, and the currently supported sources can be queried.
    Multiple stage names for an actress can also be looked up.
    Parameters:
    number
    cookies
    proxy
    sources
    TODO multi threading (would speeding this up trigger anti-scraping measures?)
[x] translate
[x] naming rule
[x] convert: actress name/tags
"""
full_sources = ['avsox', 'javbus', 'xcity', 'mgstage', 'madou', 'fc2',
'dlsite', 'jav321', 'fanza', 'airav', 'carib', 'mv91',
'gcolle', 'javdb']
func_mapping = {
'avsox': Avsox().search,
'javbus': Javbus().search,
'xcity': Xcity().search,
'mgstage': Mgstage().search,
'madou': Madou().search,
'fc2': Fc2().search,
'dlsite': Dlsite().search,
'jav321': Jav321().search,
'fanza': Fanza().search,
'airav': Airav().search,
'carib': Carib().search,
'mv91': Mv91().search,
'gcolle': Gcolle().search,
'javdb': Javdb().search,
}
proxies = None
dbcookies = None
def search(self, number, sources=None, proxies=None, dbcookies=None):
self.proxies = proxies
self.dbcookies = dbcookies
sources = self.checkSources(sources, number)
json_data = {}
for source in sources:
try:
print('[+]select', source)
try:
data = self.func_mapping[source](number, self)
if data == 404:
continue
json_data = json.loads(data)
except Exception as e:
                    print('[!] An error occurred')
print(e)
# json_data = self.func_mapping[source](number, self)
                # if any source returns valid data, break
if self.get_data_state(json_data):
print(f"[+]Find movie [{number}] metadata on website '{source}'")
break
except:
break
# Return if data not found in all sources
if not json_data:
print(f'[-]Movie Number [{number}] not found!')
return None
return json_data
def checkSources(self, c_sources, file_number):
if not c_sources:
c_sources = self.full_sources
sources = c_sources.split(',')
def insert(sources,source):
if source in sources:
sources.insert(0, sources.pop(sources.index(source)))
return sources
if len(sources) <= len(self.func_mapping):
# if the input file name matches certain rules,
# move some web service to the beginning of the list
lo_file_number = file_number.lower()
if "carib" in sources and (re.match(r"^\d{6}-\d{3}", file_number)
):
sources = insert(sources,"carib")
elif "item" in file_number:
sources = insert(sources,"getchu")
elif re.match(r"^\d{5,}", file_number) or "heyzo" in lo_file_number:
if "avsox" in sources:
sources = insert(sources,"avsox")
elif "mgstage" in sources and \
(re.match(r"\d+\D+", file_number) or "siro" in lo_file_number):
sources = insert(sources,"mgstage")
elif "fc2" in lo_file_number:
if "fc2" in sources:
sources = insert(sources,"fc2")
elif "gcolle" in sources and (re.search("\d{6}", file_number)):
sources = insert(sources,"gcolle")
elif "dlsite" in sources and (
"rj" in lo_file_number or "vj" in lo_file_number
):
sources = insert(sources,"dlsite")
elif re.match(r"^[a-z0-9]{3,}$", lo_file_number):
if "xcity" in sources:
sources = insert(sources,"xcity")
if "madou" in sources:
sources = insert(sources,"madou")
elif "madou" in sources and (
re.match(r"^[a-z0-9]{3,}-[0-9]{1,}$", lo_file_number)
):
sources = insert(sources,"madou")
# check sources in func_mapping
todel = []
for s in sources:
if not s in self.func_mapping:
print('[!] Source Not Exist : ' + s)
todel.append(s)
        for d in todel:
            print('[!] Remove Source : ' + d)
sources.remove(d)
return sources
    def get_data_state(self, data: dict) -> bool:  # check whether metadata retrieval failed
if "title" not in data or "number" not in data:
return False
if data["title"] is None or data["title"] == "" or data["title"] == "null":
return False
if data["number"] is None or data["number"] == "" or data["number"] == "null":
return False
return True
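checkSources both reorders the requested sources by filename heuristics and drops names that are not in func_mapping. A rough sketch of how that plays out, using invented numbers (no network access is involved at this stage):

# hypothetical illustration of checkSources(), not part of the commit
from scrapinglib.api import Scraping

sc = Scraping()
# a Caribbeancom-style number (six digits, dash, three digits) moves 'carib' to the front
print(sc.checkSources('avsox,carib,javbus', '010122-001'))   # ['carib', 'avsox', 'javbus']
# an FC2 number moves 'fc2' to the front
print(sc.checkSources('fc2,javdb', 'FC2-1234567'))           # ['fc2', 'javdb']
# unknown source names are removed with a warning
print(sc.checkSources('javbus,nosuchsite', 'ABP-123'))       # ['javbus']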

78
scrapinglib/avsox.py Normal file

@@ -0,0 +1,78 @@
# -*- coding: utf-8 -*-
import re
from .parser import Parser
class Avsox(Parser):
source = 'avsox'
imagecut = 3
expr_number = '//span[contains(text(),"识别码:")]/../span[2]/text()'
expr_actor = '//a[@class="avatar-box"]'
expr_actorphoto = '//a[@class="avatar-box"]'
expr_title = '/html/body/div[2]/h3/text()'
expr_studio = '//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()'
expr_release = '//span[contains(text(),"发行时间:")]/../text()'
expr_cover = '/html/body/div[2]/div[1]/div[1]/a/img/@src'
expr_smallcover = '//*[@id="waterfall"]/div/a/div[1]/img/@src'
expr_tags = '/html/head/meta[@name="keywords"]/@content'
expr_label = '//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()'
expr_series = '//span[contains(text(),"系列:")]/../span[2]/text()'
def queryNumberUrl(self, number):
qurySiteTree = self.getHtmlTree('https://tellme.pw/avsox')
site = self.getTreeIndex(qurySiteTree, '//div[@class="container"]/div/a/@href')
self.searchtree = self.getHtmlTree(site + '/cn/search/' + number)
result1 = self.getTreeIndex(self.searchtree, '//*[@id="waterfall"]/div/a/@href')
if result1 == '' or result1 == 'null' or result1 == 'None':
self.searchtree = self.getHtmlTree(site + '/cn/search/' + number.replace('-', '_'))
result1 = self.getTreeIndex(self.searchtree, '//*[@id="waterfall"]/div/a/@href')
if result1 == '' or result1 == 'null' or result1 == 'None':
self.searchtree = self.getHtmlTree(site + '/cn/search/' + number.replace('_', ''))
result1 = self.getTreeIndex(self.searchtree, '//*[@id="waterfall"]/div/a/@href')
return "https:" + result1
def getNum(self, htmltree):
new_number = self.getTreeIndex(htmltree, self.expr_number)
if new_number.upper() != self.number.upper():
raise ValueError('number not found in ' + self.source)
self.number = new_number
return new_number
def getTitle(self, htmltree):
return super().getTitle(htmltree).replace('/', '').strip(self.number)
def getStudio(self, htmltree):
return super().getStudio(htmltree).replace("', '", ' ')
def getSmallCover(self, htmltree):
""" 使用搜索页面的预览小图
"""
return self.getTreeIndex(self.searchtree, self.expr_smallcover)
def getTags(self, htmltree):
tags = super().getTags(htmltree).split(',')
return [i.strip() for i in tags[2:]] if len(tags) > 2 else []
def getOutline(self, htmltree):
from .storyline import getStoryline
return getStoryline(self.number)
def getActors(self, htmltree):
a = super().getActors(htmltree)
d = []
for i in a:
d.append(i.find('span').text)
return d
def getActorPhoto(self, htmltree):
a = super().getActorPhoto(htmltree)
d = {}
for i in a:
l = i.find('.//img').attrib['src']
t = i.find('span').text
p2 = {t: l}
d.update(p2)
return d

101
scrapinglib/carib.py Normal file

@@ -0,0 +1,101 @@
# -*- coding: utf-8 -*-
import re
from urllib.parse import urljoin
from lxml import html
from .parser import Parser
class Carib(Parser):
source = 'carib'
uncensored = True
expr_title = "//div[@class='movie-info section']/div[@class='heading']/h1[@itemprop='name']/text()"
expr_release = "//li[2]/span[@class='spec-content']/text()"
expr_runtime = "//span[@class='spec-content']/span[@itemprop='duration']/text()"
expr_actor = "//span[@class='spec-content']/a[@itemprop='actor']/span/text()"
expr_tags = "//span[@class='spec-content']/a[@itemprop='genre']/text()"
expr_extrafanart = "//*[@id='sampleexclude']/div[2]/div/div[@class='grid-item']/div/a/@href"
expr_label = "//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()"
expr_series = "//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()"
expr_outline = "//div[@class='movie-info section']/p[@itemprop='description']/text()"
def search(self, number, core: None):
self.number = number
self.updateCore(core)
self.detailurl = f'https://www.caribbeancom.com/moviepages/{number}/index.html'
htmlcode = self.getHtml(self.detailurl)
if htmlcode == 404 or 'class="movie-info section"' not in htmlcode:
return 404
htmltree = html.fromstring(htmlcode)
result = self.dictformat(htmltree)
return result
def getStudio(self, htmltree):
return '加勒比'
def getActors(self, htmltree):
r = []
actors = super().getActors(htmltree)
for act in actors:
if str(act) != '':
r.append(act)
return r
def getNum(self, htmltree):
return self.number
def getCover(self, htmltree):
return f'https://www.caribbeancom.com/moviepages/{self.number}/images/l_l.jpg'
def getTags(self, htmltree):
return self.getAll(htmltree, self.expr_tags)
def getExtrafanart(self, htmltree):
r = []
genres = self.getAll(htmltree, self.expr_extrafanart)
for g in genres:
jpg = str(g)
if '/member/' in jpg:
break
else:
r.append('https://www.caribbeancom.com' + jpg)
return r
def getActorPhoto(self, htmltree):
# return super().getActorPhoto(htmltree)
htmla = htmltree.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']")
names = htmltree.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']/span[@itemprop='name']/text()")
t = {}
for name, a in zip(names, htmla):
if name.strip() == '':
continue
p = {name.strip(): a.attrib['href']}
t.update(p)
o = {}
for k, v in t.items():
if '/search_act/' not in v:
continue
r = self.getHtml(urljoin('https://www.caribbeancom.com', v), type='object')
if not r.ok:
continue
html = r.text
pos = html.find('.full-bg')
if pos<0:
continue
css = html[pos:pos+100]
cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I)
if not cssBGjpgs or not len(cssBGjpgs[0]):
continue
p = {k: urljoin(r.url, cssBGjpgs[0])}
o.update(p)
return o
def getOutline(self, htmltree):
from .storyline import getStoryline
result = getStoryline(self.number, uncensored=self.uncensored)
if len(result):
return result
return super().getOutline(htmltree)

96
scrapinglib/dlsite.py Normal file

@@ -0,0 +1,96 @@
# -*- coding: utf-8 -*-
import re
from lxml import etree
from .parser import Parser
class Dlsite(Parser):
source = 'dlsite'
imagecut = 4
expr_title = '/html/head/title/text()'
expr_actor = '//th[contains(text(),"声优")]/../td/a/text()'
expr_studio = '//th[contains(text(),"商标名")]/../td/span[1]/a/text()'
expr_studio2 = '//th[contains(text(),"社团名")]/../td/span[1]/a/text()'
expr_runtime = '//strong[contains(text(),"時長")]/../span/text()'
expr_runtime2 = '//strong[contains(text(),"時長")]/../span/a/text()'
expr_outline = '//*[@class="work_parts_area"]/p/text()'
expr_series = '//th[contains(text(),"系列名")]/../td/span[1]/a/text()'
expr_series2 = '//th[contains(text(),"社团名")]/../td/span[1]/a/text()'
expr_director = '//th[contains(text(),"剧情")]/../td/a/text()'
expr_release = '//th[contains(text(),"贩卖日")]/../td/a/text()'
expr_cover = '//*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li[1]/picture/source/@srcset'
expr_tags = '//th[contains(text(),"分类")]/../td/div/a/text()'
expr_label = '//th[contains(text(),"系列名")]/../td/span[1]/a/text()'
expr_label2 = '//th[contains(text(),"社团名")]/../td/span[1]/a/text()'
expr_extrafanart = '//*[@id="work_left"]/div/div/div[1]/div/@data-src'
def search(self, number, core: None):
self.updateCore(core)
self.cookies = {'locale': 'zh-cn'}
if "RJ" in number or "VJ" in number:
self.number = number.upper()
self.detailurl = 'https://www.dlsite.com/maniax/work/=/product_id/' + self.number + '.html/?locale=zh_CN'
htmltree = self.getHtmlTree(self.detailurl)
else:
self.detailurl = f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie'
htmltree = self.getHtmlTree(self.detailurl)
search_result = self.getAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
if len(search_result) == 0:
number = number.replace("THE ANIMATION", "").replace("he Animation", "").replace("t", "").replace("T","")
htmltree = self.getHtmlTree(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie')
search_result = self.getAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
if len(search_result) == 0:
if "" in number:
number = number.replace("","")
elif "" in number:
number = number.replace("","")
htmltree = self.getHtmlTree(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie')
search_result = self.getAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
if len(search_result) == 0:
number = number.replace('上巻', '').replace('下巻', '').replace('前編', '').replace('後編', '')
htmltree = self.getHtmlTree(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie')
search_result = self.getAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
self.detailurl = search_result[0]
htmltree = self.getHtmlTree(self.detailurl)
self.number = str(re.findall("\wJ\w+", self.detailurl)).strip(" [']")
result = self.dictformat(htmltree)
return result
def getNum(self, htmltree):
return self.number
def getTitle(self, htmltree):
result = super().getTitle(htmltree)
result = result[:result.rfind(' | DLsite')]
result = result[:result.rfind(' [')]
result = result.replace('【HD版】', '')
return result
def getOutline(self, htmltree):
total = []
result = self.getAll(htmltree, self.expr_outline)
for i in result:
total.append(i.strip('\r\n'))
return str(total).strip(" ['']").replace("', '', '",r'\n').replace("', '",r'\n').strip(", '', '")
def getRelease(self, htmltree):
        return super().getRelease(htmltree).replace('年', '-').replace('月', '-').replace('日', '')
def getCover(self, htmltree):
return 'https:' + super().getCover(htmltree).replace('.webp', '.jpg')
def getTags(self, htmltree):
return self.getAll(htmltree, self.expr_tags)
def getExtrafanart(self, htmltree):
try:
result = []
            for i in self.getAll(htmltree, self.expr_extrafanart):
result.append("https:" + i)
except:
result = ''
return result

152
scrapinglib/fanza.py Normal file

@@ -0,0 +1,152 @@
# -*- coding: utf-8 -*-
import re
from lxml import etree
from urllib.parse import urlencode
from .parser import Parser
class Fanza(Parser):
source = 'fanza'
expr_title = '//*[starts-with(@id, "title")]/text()'
expr_outline = "//div[@class='mg-b20 lh4']/text()"
expr_outline2 = "//div[@class='mg-b20 lh4']//p/text()"
expr_runtime = "//td[contains(text(),'収録時間')]/following-sibling::td/text()"
def search(self, number, core: None):
self.number = number
self.updateCore(core)
        # fanza allows letters + numbers + underscore; normalize the input here
        # @note: the only underscore usage found so far is h_test123456789
        fanza_search_number = number
        # AV_Data_Capture.py getNumber() over-formats the input, so restore the h_ prefix
if fanza_search_number.startswith("h-"):
fanza_search_number = fanza_search_number.replace("h-", "h_")
fanza_search_number = re.sub(r"[^0-9a-zA-Z_]", "", fanza_search_number).lower()
fanza_urls = [
"https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=",
"https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=",
"https://www.dmm.co.jp/digital/anime/-/detail/=/cid=",
"https://www.dmm.co.jp/mono/anime/-/detail/=/cid=",
"https://www.dmm.co.jp/digital/videoc/-/detail/=/cid=",
"https://www.dmm.co.jp/digital/nikkatsu/-/detail/=/cid=",
"https://www.dmm.co.jp/rental/-/detail/=/cid=",
]
for url in fanza_urls:
self.detailurl = url + fanza_search_number
url = "https://www.dmm.co.jp/age_check/=/declared=yes/?"+ urlencode({"rurl": self.detailurl})
self.htmlcode = self.getHtml(url)
if self.htmlcode != 404:
self.htmltree = etree.HTML(self.htmlcode)
break
if self.htmlcode == 404:
return 404
result = self.dictformat(self.htmltree)
return result
def getNum(self, htmltree):
        # for some old pages, the input number does not match the page:
        # for example, the url will be cid=test012
        # but the hinban on the page is test00012
        # so get the hinban first, and then pass it to the following functions
self.fanza_hinban = self.getFanzaString('品番:')
self.number = self.fanza_hinban
number_lo = self.number.lower()
if (re.sub('-|_', '', number_lo) == self.fanza_hinban or
number_lo.replace('-', '00') == self.fanza_hinban or
number_lo.replace('-', '') + 'so' == self.fanza_hinban
):
self.number = self.number
return self.number
def getStudio(self, htmltree):
return self.getFanzaString('メーカー')
def getOutline(self, htmltree):
try:
result = self.getTreeIndex(htmltree, self.expr_outline).replace("\n", "")
if result == '':
result = self.getTreeIndex(htmltree, self.expr_outline2).replace("\n", "")
return result
except:
return ''
def getRuntime(self, htmltree):
return str(re.search(r'\d+', super().getRuntime(htmltree)).group()).strip(" ['']")
def getDirector(self, htmltree):
if "anime" not in self.detailurl:
return self.getFanzaString('監督:')
return ''
def getActors(self, htmltree):
if "anime" not in self.detailurl:
return super().getActors(htmltree).replace("', '", ",")
return ''
def getRelease(self, htmltree):
result = self.getFanzaString('発売日:')
if result == '' or result == '----':
result = self.getFanzaString('配信開始日:')
return result.replace("/", "-").strip('\\n')
def getCover(self, htmltree):
# return super().getCover(htmltree)
cover_number = self.fanza_hinban
try:
result = self.getTreeIndex(htmltree, '//*[@id="' + cover_number + '"]/@href')
except:
            # sometimes fanza changes _ to \u005f in the image id
if "_" in cover_number:
cover_number = cover_number.replace("_", r"\u005f")
try:
result = self.getTreeIndex(htmltree, '//*[@id="' + cover_number + '"]/@href')
except:
# (TODO) handle more edge case
# print(html)
# raise exception here, same behavior as before
# people's major requirement is fetching the picture
raise ValueError("can not find image")
return result
def getTags(self, htmltree):
return self.getFanzaStrings('ジャンル:')
def getExtrafanart(self, htmltree):
html_pather = re.compile(r'<div id=\"sample-image-block\"[\s\S]*?<br></div>\n</div>')
html = html_pather.search(self.htmlcode)
if html:
html = html.group()
extrafanart_pather = re.compile(r'<img.*?src=\"(.*?)\"')
extrafanart_imgs = extrafanart_pather.findall(html)
if extrafanart_imgs:
s = []
for img_url in extrafanart_imgs:
img_urls = img_url.rsplit('-', 1)
img_url = img_urls[0] + 'jp-' + img_urls[1]
s.append(img_url)
return s
return ''
def getLabel(self, htmltree):
return self.getFanzaStrings('レーベル')
def getSeries(self, htmltree):
return self.getFanzaStrings('シリーズ:')
def getFanzaString(self, expr):
result1 = str(self.htmltree.xpath("//td[contains(text(),'"+expr+"')]/following-sibling::td/a/text()")).strip(" ['']")
result2 = str(self.htmltree.xpath("//td[contains(text(),'"+expr+"')]/following-sibling::td/text()")).strip(" ['']")
return result1+result2
def getFanzaStrings(self, string):
result1 = self.htmltree.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/a/text()")
if len(result1) > 0:
return result1
result2 = self.htmltree.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/text()")
return result2

73
scrapinglib/fc2.py Normal file

@@ -0,0 +1,73 @@
# -*- coding: utf-8 -*-
import json
import re
from lxml import etree
from urllib.parse import urljoin
from .parser import Parser
class Fc2(Parser):
source = 'fc2'
imagecut = 0
expr_title = '/html/head/title/text()'
expr_studio = '//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()'
expr_release = '//*[@id="top"]/div[1]/section[1]/div/section/div[2]/div[2]/p/text()'
expr_runtime = "//p[@class='items_article_info']/text()"
expr_director = '//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()'
expr_actor = '//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()'
expr_cover = "//div[@class='items_article_MainitemThumb']/span/img/@src"
expr_tags = "//a[@class='tag tagTag']/text()"
def search(self, number, core: None):
self.number = number.replace('FC2-', '').replace('fc2-', '')
self.updateCore(core)
self.detailurl = 'https://adult.contents.fc2.com/article/' + self.number + '/'
self.htmlcode = self.getHtml(self.detailurl)
if self.htmlcode == 404:
return 404
htmltree = etree.HTML(self.htmlcode)
result = self.dictformat(htmltree)
return result
def getNum(self, htmltree):
return 'FC2-' + self.number
def getRelease(self, htmltree):
return super().getRelease(htmltree).strip(" ['販売日 : ']").replace('/','-')
def getActors(self, htmltree):
actors = super().getActors(htmltree)
if not actors:
actors = '素人'
return actors
def getCover(self, htmltree):
return urljoin('https://adult.contents.fc2.com', super().getCover(htmltree))
def getExtrafanart(self, htmltree):
html_pather = re.compile(r'<ul class=\"items_article_SampleImagesArea\"[\s\S]*?</ul>')
html = html_pather.search(self.htmlcode)
if html:
html = html.group()
extrafanart_pather = re.compile(r'<a href=\"(.*?)\"')
extrafanart_imgs = extrafanart_pather.findall(html)
if extrafanart_imgs:
return extrafanart_imgs
return ''
def getTrailer(self, htmltree):
video_pather = re.compile(r'\'[a-zA-Z0-9]{32}\'')
video = video_pather.findall(self.htmlcode)
if video:
try:
video_url = video[0].replace('\'', '')
video_url = 'https://adult.contents.fc2.com/api/v2/videos/' + self.number + '/sample?key=' + video_url
                # the endpoint returns JSON, so parse it instead of using eval()
                url_json = json.loads(self.getHtml(video_url))['path'].replace('\\', '')
return url_json
except:
return ''
else:
return ''

75
scrapinglib/gcolle.py Normal file

@@ -0,0 +1,75 @@
# -*- coding: utf-8 -*-
import re
from lxml import etree
from .httprequest import get_html_session
from .parser import Parser
class Gcolle(Parser):
source = 'gcolle'
imagecut = 4
expr_r18 = '//*[@id="main_content"]/table[1]/tbody/tr/td[2]/table/tbody/tr/td/h4/a[2]/@href'
expr_number = '//td[contains(text(),"商品番号")]/../td[2]/text()'
expr_title = '//*[@id="cart_quantity"]/table/tr[1]/td/h1/text()'
expr_studio = '//td[contains(text(),"アップロード会員名")]/b/text()'
expr_director = '//td[contains(text(),"アップロード会員名")]/b/text()'
expr_actor = '//td[contains(text(),"アップロード会員名")]/b/text()'
expr_label = '//td[contains(text(),"アップロード会員名")]/b/text()'
expr_series = '//td[contains(text(),"アップロード会員名")]/b/text()'
expr_release = '//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime'
expr_cover = '//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href'
expr_tags = '//*[@id="cart_quantity"]/table/tr[4]/td/a/text()'
expr_outline = '//*[@id="cart_quantity"]/table/tr[3]/td/p/text()'
expr_extrafanart = '//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src'
expr_extrafanart2 = '//*[@id="cart_quantity"]/table/tr[3]/td/div/a/img/@src'
def search(self, number, core: None):
self.number = number.upper().replace('GCOLLE-','')
self.updateCore(core)
self.detailurl = 'https://gcolle.net/product_info.php/products_id/' + self.number
session = get_html_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify)
htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + number).text
htmltree = etree.HTML(htmlcode)
r18url = self.getTreeIndex(htmltree, self.expr_r18)
if r18url and r18url.startswith('http'):
htmlcode = session.get(r18url).text
htmltree = etree.HTML(htmlcode)
result = self.dictformat(htmltree)
return result
def getNum(self, htmltree):
num = super().getNum(htmltree)
if self.number != num:
raise Exception(f'[!] {self.number}: find [{num}] in gcolle, not match')
return "GCOLLE-" + str(num)
def getOutline(self, htmltree):
result = self.getAll(htmltree, self.expr_outline)
try:
return "\n".join(result)
except:
return ""
def getRelease(self, htmltree):
return re.findall('\d{4}-\d{2}-\d{2}', super().getRelease(htmltree))[0]
def getCover(self, htmltree):
return "https:" + super().getCover(htmltree)
def getTags(self, htmltree):
return self.getAll(htmltree, self.expr_tags)
def getExtrafanart(self, htmltree):
extrafanart = self.getAll(htmltree, self.expr_extrafanart)
if len(extrafanart) == 0:
extrafanart = self.getAll(htmltree, self.expr_extrafanart2)
# Add "https:" in each extrafanart url
for i in range(len(extrafanart)):
extrafanart[i] = 'https:' + extrafanart[i]
return extrafanart

8
scrapinglib/getchu.py Normal file

@@ -0,0 +1,8 @@
# -*- coding: utf-8 -*-
from .parser import Parser
class Getchu(Parser):
source = 'getchu'
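Getchu is only a stub in this commit. For context, a site parser in this library is a Parser subclass that sets a source name and xpath expressions and overrides getters where the defaults are not enough; a hypothetical minimal parser following that pattern might look like this (the site, URL and xpaths are invented for illustration):

# hypothetical sketch of a new site parser, mirroring the pattern of the modules above
from .parser import Parser

class Example(Parser):
    source = 'example'
    # xpath expressions consumed by the base-class getters
    expr_title = '/html/head/title/text()'
    expr_cover = '//img[@class="cover"]/@src'

    def search(self, number, core: None):
        self.number = number
        self.updateCore(core)                     # pick up proxies/cookies from the caller
        self.detailurl = 'https://example.com/v/' + number
        htmltree = self.getHtmlTree(self.detailurl)
        return self.dictformat(htmltree)          # serialize the collected fields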

251
scrapinglib/httprequest.py Normal file

@@ -0,0 +1,251 @@
# -*- coding: utf-8 -*-
import mechanicalsoup
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from cloudscraper import create_scraper
G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.133 Safari/537.36'
G_DEFAULT_TIMEOUT = 10
def get(url: str, cookies = None, ua: str = None, return_type: str = None, encoding: str = None,
retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
"""
    Core HTTP request helper.
    Whether to go through a proxy is decided by the caller.
"""
errors = ""
headers = {"User-Agent": ua or G_USER_AGENT}
for i in range(retry):
try:
result = requests.get(url, headers=headers, timeout=timeout, proxies=proxies,
verify=verify,
cookies=cookies)
if return_type == "object":
return result
elif return_type == "content":
return result.content
else:
result.encoding = encoding or result.apparent_encoding
return result.text
except Exception as e:
print(f"[-]Connect: {url} retry {i + 1}/{retry}")
errors = str(e)
if "getaddrinfo failed" in errors:
print("[-]Connect Failed! Please Check your proxy config")
print("[-]" + errors)
else:
print("[-]" + errors)
print('[-]Connect Failed! Please check your Proxy or Network!')
raise Exception('Connect Failed')
def post(url: str, data: dict, cookies = None, ua: str = None, return_type: str = None, encoding: str = None,
retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
"""
    Whether to go through a proxy is decided by the caller.
"""
errors = ""
headers = {"User-Agent": ua or G_USER_AGENT}
for i in range(retry):
try:
result = requests.post(url, data=data, headers=headers, timeout=timeout, proxies=proxies,
verify=verify,
cookies=cookies)
if return_type == "object":
return result
elif return_type == "content":
return result.content
else:
result.encoding = encoding or result.apparent_encoding
return result
except Exception as e:
print(f"[-]Connect: {url} retry {i + 1}/{retry}")
errors = str(e)
if "getaddrinfo failed" in errors:
print("[-]Connect Failed! Please Check your proxy config")
print("[-]" + errors)
else:
print("[-]" + errors)
print('[-]Connect Failed! Please check your Proxy or Network!')
raise Exception('Connect Failed')
#
# TODO: the helpers below are temporary; revisit them once all the site parsers have been updated
#
class TimeoutHTTPAdapter(HTTPAdapter):
def __init__(self, *args, **kwargs):
self.timeout = G_DEFAULT_TIMEOUT
if "timeout" in kwargs:
self.timeout = kwargs["timeout"]
del kwargs["timeout"]
super().__init__(*args, **kwargs)
def send(self, request, **kwargs):
timeout = kwargs.get("timeout")
if timeout is None:
kwargs["timeout"] = self.timeout
return super().send(request, **kwargs)
# with keep-alive feature
# storyline carib gcolle javdb only
def get_html_session(url: str = None, cookies = None, ua: str = None, return_type: str = None,
encoding: str = None, retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
session = requests.Session()
retries = Retry(total=retry, connect=retry, backoff_factor=1,
status_forcelist=[429, 500, 502, 503, 504])
session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
if isinstance(cookies, dict) and len(cookies):
requests.utils.add_dict_to_cookiejar(session.cookies, cookies)
if verify:
session.verify = verify
if proxies:
session.proxies = proxies
session.headers = {"User-Agent": ua or G_USER_AGENT}
try:
if isinstance(url, str) and len(url):
result = session.get(str(url))
        else:  # an empty url just returns the reusable session object; return_type does not apply
return session
if not result.ok:
return None
if return_type == "object":
return result
elif return_type == "content":
return result.content
elif return_type == "session":
return result, session
else:
result.encoding = encoding or "utf-8"
return result.text
except requests.exceptions.ProxyError:
print("[-]get_html_session() Proxy error! Please check your Proxy")
except Exception as e:
print(f"[-]get_html_session() failed. {e}")
return None
# storyline only
# uses cloudscraper
def get_html_by_browser(url: str = None, cookies: dict = None, ua: str = None, return_type: str = None,
encoding: str = None, use_scraper: bool = False,
retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
session = create_scraper(browser={'custom': ua or G_USER_AGENT, }) if use_scraper else requests.Session()
if isinstance(cookies, dict) and len(cookies):
requests.utils.add_dict_to_cookiejar(session.cookies, cookies)
retries = Retry(total=retry, connect=retry, backoff_factor=1,
status_forcelist=[429, 500, 502, 503, 504])
session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
if verify:
session.verify = verify
if proxies:
session.proxies = proxies
try:
browser = mechanicalsoup.StatefulBrowser(user_agent=ua or G_USER_AGENT, session=session)
if isinstance(url, str) and len(url):
result = browser.open(url)
else:
return browser
if not result.ok:
return None
if return_type == "object":
return result
elif return_type == "content":
return result.content
elif return_type == "browser":
return result, browser
else:
result.encoding = encoding or "utf-8"
return result.text
except requests.exceptions.ProxyError:
print("[-]get_html_by_browser() Proxy error! Please check your Proxy")
except Exception as e:
print(f'[-]get_html_by_browser() Failed! {e}')
return None
# storyline xcity only
def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None,
return_type: str = None, encoding: str = None,
retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
session = requests.Session()
if isinstance(cookies, dict) and len(cookies):
requests.utils.add_dict_to_cookiejar(session.cookies, cookies)
retries = Retry(total=retry, connect=retry, backoff_factor=1,
status_forcelist=[429, 500, 502, 503, 504])
session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
if verify:
session.verify = verify
if proxies:
session.proxies = proxies
try:
browser = mechanicalsoup.StatefulBrowser(user_agent=ua or G_USER_AGENT, session=session)
result = browser.open(url)
if not result.ok:
return None
form = browser.select_form() if form_select is None else browser.select_form(form_select)
if isinstance(fields, dict):
for k, v in fields.items():
browser[k] = v
response = browser.submit_selected()
if return_type == "object":
return response
elif return_type == "content":
return response.content
elif return_type == "browser":
return response, browser
else:
result.encoding = encoding or "utf-8"
return response.text
except requests.exceptions.ProxyError:
print("[-]get_html_by_form() Proxy error! Please check your Proxy")
except Exception as e:
print(f'[-]get_html_by_form() Failed! {e}')
return None
# storyline javdb only
def get_html_by_scraper(url: str = None, cookies: dict = None, ua: str = None, return_type: str = None,
encoding: str = None, retry: int = 3, proxies=None, timeout: int = G_DEFAULT_TIMEOUT, verify=None):
session = create_scraper(browser={'custom': ua or G_USER_AGENT, })
if isinstance(cookies, dict) and len(cookies):
requests.utils.add_dict_to_cookiejar(session.cookies, cookies)
retries = Retry(total=retry, connect=retry, backoff_factor=1,
status_forcelist=[429, 500, 502, 503, 504])
session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
if verify:
session.verify = verify
if proxies:
session.proxies = proxies
try:
if isinstance(url, str) and len(url):
result = session.get(str(url))
        else:  # an empty url just returns the reusable scraper object; return_type does not apply
return session
if not result.ok:
return None
if return_type == "object":
return result
elif return_type == "content":
return result.content
elif return_type == "scraper":
return result, session
else:
result.encoding = encoding or "utf-8"
return result.text
except requests.exceptions.ProxyError:
print("[-]get_html_by_scraper() Proxy error! Please check your Proxy")
except Exception as e:
print(f"[-]get_html_by_scraper() failed. {e}")
return None
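The helpers above all follow the same convention: they return decoded text by default, the response object or raw content on request, and a reusable session/browser/scraper when called without a url. A short usage sketch (the urls, proxy address and cookie values are placeholders):

# hypothetical usage of the request helpers, not part of the commit
from scrapinglib import httprequest
from scrapinglib.httprequest import get_html_session

# one-shot GET returning decoded text; raises once the retries are exhausted
text = httprequest.get('https://example.com/page',
                       proxies={'https': 'http://127.0.0.1:7890'})

# called without a url, get_html_session returns a configured, reusable session
session = get_html_session(cookies={'adc': '1'})
resp = session.get('https://example.com/other')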

85
scrapinglib/jav321.py Normal file

@@ -0,0 +1,85 @@
# -*- coding: utf-8 -*-
import re
from lxml import etree
from scrapinglib import httprequest
from .parser import Parser
class Jav321(Parser):
source = 'jav321'
expr_title = "/html/body/div[2]/div[1]/div[1]/div[1]/h3/text()"
expr_cover = "/html/body/div[2]/div[2]/div[1]/p/a/img/@src"
expr_outline = "/html/body/div[2]/div[1]/div[1]/div[2]/div[3]/div/text()"
    # NOTE: use xpath uniformly for the fields below
expr_number = '//b[contains(text(),"品番")]/following-sibling::node()'
expr_actor = '//b[contains(text(),"出演者")]/following-sibling::a[starts-with(@href,"/star")]'
expr_label = '//b[contains(text(),"メーカー")]/following-sibling::a[starts-with(@href,"/company")]'
expr_tags = '//b[contains(text(),"ジャンル")]/following-sibling::a[starts-with(@href,"/genre")]'
expr_studio = '//b[contains(text(),"メーカー")]/following-sibling::a[starts-with(@href,"/company")]'
expr_release = '//b[contains(text(),"配信開始日")]/following-sibling::node()'
expr_runtime = '//b[contains(text(),"収録時間")]/following-sibling::node()'
# expr_series = '//b[contains(text(),"シリーズ")]'
def queryNumberUrl(self, number):
return 'https://www.jav321.com/search'
def getHtmlTree(self, url):
resp = httprequest.post(url, data={"sn": self.number}, cookies=self.cookies, proxies=self.proxies, verify=self.verify)
if "/video/" in resp.url:
self.detailurl = resp.url
self.detailhtml = resp.text
return etree.fromstring(resp.text, etree.HTMLParser())
return None
def getNum(self, htmltree):
return super().getNum(htmltree).split(": ")[1]
def getTrailer(self, htmltree):
videourl_pather = re.compile(r'<source src=\"(.*?)\"')
videourl = videourl_pather.findall(self.detailhtml)
if videourl:
url = videourl[0].replace('awscc3001.r18.com', 'cc3001.dmm.co.jp').replace('cc3001.r18.com', 'cc3001.dmm.co.jp')
return url
else:
return ''
def getExtrafanart(self, htmltree):
html_pather = re.compile(r'<div class=\"col\-md\-3\"><div class=\"col\-xs\-12 col\-md\-12\">[\s\S]*?</script><script async src=\"\/\/adserver\.juicyads\.com/js/jads\.js\">')
html = html_pather.search(self.detailhtml)
if html:
html = html.group()
extrafanart_pather = re.compile(r'<img.*?src=\"(.*?)\"')
extrafanart_imgs = extrafanart_pather.findall(html)
if extrafanart_imgs:
return extrafanart_imgs
return ''
def getRelease(self, htmltree):
return super().getRelease(htmltree).split(": ")[1]
def getRuntime(self, htmltree):
return super().getRuntime(htmltree).split(": ")[1]
def parseElement(self, all):
if all:
ret = []
for si in all:
ret.append(si.text)
return ",".join(ret)
return ''
def getActors(self, htmltree):
return self.parseElement(super().getActors(htmltree))
def getLabel(self, htmltree):
return self.parseElement(self.getAll(htmltree, self.expr_label))
def getTags(self, htmltree):
return self.parseElement(self.getAll(htmltree, self.expr_tags))
def getStudio(self, htmltree):
return self.parseElement(self.getAll(htmltree, self.expr_studio))

145
scrapinglib/javbus.py Normal file

@@ -0,0 +1,145 @@
# -*- coding: utf-8 -*-
import re
import os
import secrets
import inspect
from lxml import etree
from urllib.parse import urljoin
from .parser import Parser
class Javbus(Parser):
source = 'javbus'
expr_number = '/html/head/meta[@name="keywords"]/@content'
expr_title = '/html/head/title/text()'
expr_studio = '//span[contains(text(),"製作商:")]/../a/text()'
expr_studio2 = '//span[contains(text(),"メーカー:")]/../a/text()'
expr_director = '//span[contains(text(),"導演:")]/../a/text()'
expr_directorJa = '//span[contains(text(),"監督:")]/../a/text()'
expr_series = '//span[contains(text(),"系列:")]/../a/text()'
expr_series2 = '//span[contains(text(),"シリーズ:")]/../a/text()'
expr_label = '//span[contains(text(),"系列:")]/../a/text()'
expr_cover = '//a[@class="bigImage"]/@href'
expr_release = '/html/body/div[5]/div[1]/div[2]/p[2]/text()'
expr_runtime = '/html/body/div[5]/div[1]/div[2]/p[3]/text()'
expr_actor = '//div[@class="star-name"]/a'
expr_actorphoto = '//div[@class="star-name"]/../a/img'
expr_tags = '/html/head/meta[@name="keywords"]/@content'
expr_uncensored = '//*[@id="navbar"]/ul[1]/li[@class="active"]/a[contains(@href,"uncensored")]'
def search(self, number, core: None):
self.number = number
self.updateCore(core)
try:
url = "https://www." + secrets.choice([
'buscdn.fun', 'busdmm.fun', 'busfan.fun', 'busjav.fun',
'cdnbus.fun',
'dmmbus.fun', 'dmmsee.fun',
'fanbus.us',
'seedmm.fun',
]) + "/"
try:
self.detailurl = url + number
self.htmlcode = self.getHtml(self.detailurl)
except:
self.detailurl = 'https://www.javbus.com/' + number
self.htmlcode = self.getHtml(self.detailurl)
if self.htmlcode == 404:
return 404
htmltree = etree.fromstring(self.htmlcode,etree.HTMLParser())
result = self.dictformat(htmltree)
return result
except:
            return self.searchUncensored(number)
def searchUncensored(self, number):
""" 二次搜索无码
"""
self.imagecut = 0
self.uncensored = True
w_number = number.replace('.', '-')
self.detailurl = 'https://www.javbus.red/' + w_number
self.htmlcode = self.getHtml(self.detailurl)
if self.htmlcode == 404:
return 404
htmltree = etree.fromstring(self.htmlcode, etree.HTMLParser())
result = self.dictformat(htmltree)
return result
def getNum(self, htmltree):
return super().getNum(htmltree).split(',')[0]
def getTitle(self, htmltree):
title = super().getTitle(htmltree)
title = str(re.findall('^.+?\s+(.*) - JavBus$', title)[0]).strip()
return title
def getStudio(self, htmltree):
if self.uncensored:
return self.getTreeIndex(htmltree, self.expr_studio2)
else:
return self.getTreeIndex(htmltree, self.expr_studio)
def getCover(self, htmltree):
return urljoin("https://www.javbus.com", super().getCover(htmltree))
def getRelease(self, htmltree):
return super().getRelease(htmltree).strip(" ['']")
def getRuntime(self, htmltree):
return super().getRuntime(htmltree).strip(" ['']分鐘")
def getActors(self, htmltree):
actors = super().getActors(htmltree)
b=[]
for i in actors:
b.append(i.attrib['title'])
return b
def getActorPhoto(self, htmltree):
actors = super().getActorPhoto(htmltree)
d = {}
for i in actors:
p = i.attrib['src']
if "nowprinting.gif" in p:
continue
t = i.attrib['title']
d[t] = urljoin("https://www.javbus.com", p)
return d
def getDirector(self, htmltree):
if self.uncensored:
return self.getTreeIndex(htmltree, self.expr_directorJa)
else:
return self.getTreeIndex(htmltree, self.expr_director)
def getSeries(self, htmltree):
if self.uncensored:
return self.getTreeIndex(htmltree, self.expr_series2)
else:
return self.getTreeIndex(htmltree, self.expr_series)
def getTags(self, htmltree):
tags = super().getTags(htmltree).split(',')
return tags[1:]
def getExtrafanart(self, htmltree):
html_pather = re.compile(r'<div id=\"sample-waterfall\">[\s\S]*?</div></a>\s*?</div>')
html = html_pather.search(self.htmlcode)
if html:
html = html.group()
extrafanart_pather = re.compile(r'<a class=\"sample-box\" href=\"(.*?)\"')
extrafanart_imgs = extrafanart_pather.findall(html)
if extrafanart_imgs:
return [urljoin('https://www.javbus.com',img) for img in extrafanart_imgs]
return ''
def getOutline(self, htmltree):
if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'):
            return ''  # calls coming from airav.py skip the outline and return immediately, to avoid duplicate scraping slowing things down
from .storyline import getStoryline
return getStoryline(self.number , uncensored = self.uncensored)

229
scrapinglib/javdb.py Normal file

@@ -0,0 +1,229 @@
# -*- coding: utf-8 -*-
import re
from urllib.parse import urljoin
from lxml import etree
from requests import session
from .httprequest import get_html_session
from .parser import Parser
class Javdb(Parser):
source = 'javdb'
fixstudio = False
expr_number = '//strong[contains(text(),"番號")]/../span/text()'
expr_number2 = '//strong[contains(text(),"番號")]/../span/a/text()'
expr_title = "/html/head/title/text()"
expr_runtime = '//strong[contains(text(),"時長")]/../span/text()'
expr_runtime2 = '//strong[contains(text(),"時長")]/../span/a/text()'
expr_uncensored = '//strong[contains(text(),"類別")]/../span/a[contains(@href,"/tags/uncensored?") or contains(@href,"/tags/western?")]'
expr_actor = '//span[@class="value"]/a[contains(@href,"/actors/")]/text()'
expr_actor2 = '//span[@class="value"]/a[contains(@href,"/actors/")]/../strong/@class'
expr_release = '//strong[contains(text(),"日期")]/../span/text()'
expr_studio = '//strong[contains(text(),"片商")]/../span/a/text()'
expr_studio2 = '//strong[contains(text(),"賣家:")]/../span/a/text()'
expr_director = '//strong[contains(text(),"導演")]/../span/text()'
expr_director2 = '//strong[contains(text(),"導演")]/../span/a/text()'
expr_cover = "//div[contains(@class, 'column-video-cover')]/a/img/@src"
expr_cover2 = "//div[contains(@class, 'column-video-cover')]/img/@src"
expr_extrafanart = "//article[@class='message video-panel']/div[@class='message-body']/div[@class='tile-images preview-images']/a[contains(@href,'/samples/')]/@href"
expr_tags = '//strong[contains(text(),"類別")]/../span/a/text()'
expr_tags2 = '//strong[contains(text(),"類別")]/../span/text()'
expr_series = '//strong[contains(text(),"系列")]/../span/text()'
expr_series2 = '//strong[contains(text(),"系列")]/../span/a/text()'
expr_label = '//strong[contains(text(),"系列")]/../span/text()'
expr_label2 = '//strong[contains(text(),"系列")]/../span/a/text()'
expr_userrating = '//span[@class="score-stars"]/../text()'
expr_uservotes = '//span[@class="score-stars"]/../text()'
expr_actorphoto = '//strong[contains(text(),"演員:")]/../span/a[starts-with(@href,"/actors/")]'
def updateCore(self, core):
if core.proxies:
self.proxies = core.proxies
if core.dbcookies:
self.cookies = core.dbcookies
else:
self.cookies = {'over18':'1', 'theme':'auto', 'locale':'zh'}
def search(self, number, core: None):
self.number = number
self.updateCore(core)
self.session = get_html_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify)
self.detailurl = self.queryNumberUrl(number)
        self.detailpage = self.session.get(self.detailurl).text
        htmltree = etree.fromstring(self.detailpage, etree.HTMLParser())
result = self.dictformat(htmltree)
return result
def queryNumberUrl(self, number):
javdb_url = 'https://javdb.com/search?q=' + number + '&f=all'
try:
resp = self.session.get(javdb_url)
except Exception as e:
print(e)
            raise Exception(f'[!] {self.number}: page not found in javdb')
htmltree = etree.fromstring(resp.text, etree.HTMLParser())
        # javdb sometimes returns multiple results,
        # and the first element may not be the one we are looking for;
        # iterate over all candidates to find the matching one
urls = self.getAll(htmltree, '//*[contains(@class,"movie-list")]/div/a/@href')
        # note the western-style ids, e.g. ['Blacked','Blacked']
if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number):
correct_url = urls[0]
else:
ids = self.getAll(htmltree, '//*[contains(@class,"movie-list")]/div/a/div[contains(@class, "video-title")]/strong/text()')
try:
correct_url = urls[ids.index(number)]
except:
                # to avoid picking up a wrong number, only accept an exact match
if ids[0].upper() != number:
raise ValueError("number not found in javdb")
correct_url = urls[0]
return urljoin(resp.url, correct_url)
def getNum(self, htmltree):
result1 = str(self.getAll(htmltree, self.expr_number)).strip(" ['']")
result2 = str(self.getAll(htmltree, self.expr_number2)).strip(" ['']")
dp_number = str(result2 + result1).strip('+')
        # NOTE verify the match and update self.number
if dp_number.upper() != self.number.upper():
raise Exception(f'[!] {self.number}: find [{dp_number}] in javdb, not match')
self.number = dp_number
return self.number
def getTitle(self, htmltree):
browser_title = super().getTitle(htmltree)
title = browser_title[:browser_title.find(' | JavDB')].strip()
return title.replace(self.number, '').strip()
def getRuntime(self, htmltree):
result1 = str(self.getAll(htmltree, self.expr_runtime)).strip(" ['']")
result2 = str(self.getAll(htmltree, self.expr_runtime2)).strip(" ['']")
return str(result1 + result2).strip('+').rstrip('mi')
def getDirector(self, htmltree):
result1 = str(self.getAll(htmltree, self.expr_director)).strip(" ['']")
result2 = str(self.getAll(htmltree, self.expr_director2)).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getSeries(self, htmltree):
result1 = str(self.getAll(htmltree, self.expr_series)).strip(" ['']")
result2 = str(self.getAll(htmltree, self.expr_series2)).strip(" ['']")
result = str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
if not result and self.fixstudio:
result = self.getStudio(htmltree)
return result
def getLabel(self, htmltree):
result1 = str(self.getAll(htmltree, self.expr_label)).strip(" ['']")
result2 = str(self.getAll(htmltree, self.expr_label2)).strip(" ['']")
result = str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
if not result and self.fixstudio:
result = self.getStudio(htmltree)
return result
def getActors(self, htmltree):
actors = self.getAll(htmltree, self.expr_actor)
genders = self.getAll(htmltree, self.expr_actor2)
r = []
idx = 0
        # NOTE only female performers; the others are ignored
actor_gendor = 'female'
for act in actors:
if((actor_gendor == 'all')
or (actor_gendor == 'both' and genders[idx] in ['symbol female', 'symbol male'])
or (actor_gendor == 'female' and genders[idx] == 'symbol female')
or (actor_gendor == 'male' and genders[idx] == 'symbol male')):
r.append(act)
idx = idx + 1
if re.match(r'FC2-[\d]+', self.number, re.A) and not r:
r = '素人'
self.fixstudio = True
return r
def getOutline(self, htmltree):
from .storyline import getStoryline
return getStoryline(self.number, self.getUncensored(htmltree))
def getStudio(self, htmltree):
try:
return self.getAll(htmltree, self.expr_studio).strip(" ['']")
except:
pass
try:
return self.getAll(htmltree, self.expr_studio2).strip(" ['']")
except:
return ''
def getTrailer(self, htmltree):
video_pather = re.compile(r'<video id\=\".*?>\s*?<source src=\"(.*?)\"')
        video = video_pather.findall(self.detailpage)
        # also guard against an empty match list
if video and video[0] != "":
if not 'https:' in video[0]:
video_url = 'https:' + video[0]
else:
video_url = video[0]
else:
video_url = ''
return video_url
def getTags(self, htmltree):
try:
return self.getAll(htmltree, self.expr_tags)
except:
pass
try:
return self.getAll(htmltree, self.expr_tags2)
except:
return ''
def getUserRating(self, htmltree):
try:
result = str(self.getTreeIndex(htmltree, self.expr_userrating))
v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', result)
return float(v[0][0])
except:
return
def getUserVotes(self, htmltree):
try:
result = str(self.getTreeIndex(htmltree, self.expr_uservotes))
v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', result)
return int(v[0][1])
except:
return
def getaphoto(self, url, session):
html_page = session.get(url).text
img_url = re.findall(r'<span class\=\"avatar\" style\=\"background\-image\: url\((.*?)\)', html_page)
return img_url[0] if img_url else ''
def getActorPhoto(self, htmltree):
actorall = self.getAll(htmltree, self.expr_actorphoto)
if not actorall:
return {}
actors = self.getActors(htmltree)
actor_photo = {}
for i in actorall:
x = re.findall(r'/actors/(.*)', i.attrib['href'], re.A)
if not len(x) or not len(x[0]) or i.text not in actors:
continue
            # NOTE: https://c1.jdbstatic.com changes frequently, so take the photo url from the actor page instead
# actor_id = x[0]
# pic_url = f"https://c1.jdbstatic.com/avatars/{actor_id[:2].lower()}/{actor_id}.jpg"
# if not self.session.head(pic_url).ok:
try:
pic_url = self.getaphoto(urljoin('https://javdb.com', i.attrib['href']), self.session)
if len(pic_url):
actor_photo[i.text] = pic_url
except:
pass
return actor_photo

65
scrapinglib/madou.py Normal file

@@ -0,0 +1,65 @@
# -*- coding: utf-8 -*-
import re
from lxml import etree
from urllib.parse import urlparse, unquote
from .parser import Parser
class Madou(Parser):
source = 'madou'
uncensored = True
expr_url = '//a[@class="share-weixin"]/@data-url'
expr_title = "/html/head/title/text()"
expr_studio = '//a[@rel="category tag"]/text()'
expr_tags = '/html/head/meta[@name="keywords"]/@content'
def search(self, number, core: None):
self.number = number.lower().strip()
self.updateCore(core)
self.detailurl = "https://madou.club/" + number + ".html"
self.htmlcode = self.getHtml(self.detailurl)
if self.htmlcode == 404:
return 404
htmltree = etree.fromstring(self.htmlcode, etree.HTMLParser())
self.detailurl = self.getTreeIndex(htmltree, self.expr_url)
result = self.dictformat(htmltree)
return result
def getNum(self, htmltree):
try:
            # decode the url
filename = unquote(urlparse(self.detailurl).path)
            # trim the filename
result = filename[1:-5].upper().strip()
            # strip any Chinese characters
if result.upper() != self.number.upper():
result = re.split(r'[^\x00-\x7F]+', result, 1)[0]
            # strip leftover separators
return result.strip('-')
except:
return ''
def getTitle(self, htmltree):
# <title>MD0140-2 / 家有性事EP2 爱在身边-麻豆社</title>
# <title>MAD039 机灵可爱小叫花 强诱僧人迫犯色戒-麻豆社</title>
# <title>MD0094贫嘴贱舌中出大嫂坏嫂嫂和小叔偷腥内射受孕-麻豆社</title>
# <title>TM0002-我的痴女女友-麻豆社</title>
browser_title = str(super().getTitle(htmltree))
title = str(re.findall(r'^[A-Z0-9 /\-]*(.*)-麻豆社$', browser_title)[0]).strip()
return title
def getCover(self, htmltree):
try:
url = str(re.findall("shareimage : '(.*?)'", self.htmlcode)[0])
return url.strip()
except:
return ''
def getTags(self, htmltree):
studio = self.getStudio(htmltree)
x = super().getTags(htmltree).split(',')
return [i.strip() for i in x if len(i.strip()) and studio not in i and '麻豆' not in i]

83
scrapinglib/mgstage.py Normal file

@@ -0,0 +1,83 @@
# -*- coding: utf-8 -*-
import re
from lxml import etree
from bs4 import BeautifulSoup
from .parser import Parser
class Mgstage(Parser):
source = 'mgstage'
expr_number = '//th[contains(text(),"品番:")]/../td/a/text()'
expr_title = '//*[@id="center_column"]/div[1]/h1/text()'
expr_studio = '//th[contains(text(),"メーカー:")]/../td/a/text()'
expr_outline = '//p/text()'
expr_runtime = '//th[contains(text(),"収録時間:")]/../td/a/text()'
expr_director = '//th[contains(text(),"シリーズ")]/../td/a/text()'
expr_actor = '//th[contains(text(),"出演:")]/../td/a/text()'
expr_release = '//th[contains(text(),"配信開始日:")]/../td/a/text()'
expr_cover = '//*[@id="EnlargeImage"]/@href'
expr_label = '//th[contains(text(),"シリーズ:")]/../td/a/text()'
expr_tags = '//th[contains(text(),"ジャンル:")]/../td/a/text()'
expr_tags2 = '//th[contains(text(),"ジャンル:")]/../td/text()'
expr_series = '//th[contains(text(),"シリーズ")]/../td/a/text()'
def search(self, number, core: None):
self.number = number.upper()
self.updateCore(core)
self.cookies = {'adc':'1'}
self.detailurl = 'https://www.mgstage.com/product/product_detail/'+str(self.number)+'/'
self.htmlcode = self.getHtml(self.detailurl)
soup = BeautifulSoup(self.htmlcode, 'lxml')
self.detailpage = str(soup.find(attrs={'class': 'detail_data'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
b2 = str(soup.find(attrs={'id': 'introduction'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
self.htmlcodetree = etree.HTML(self.htmlcode)
self.detailtree = etree.HTML(self.detailpage)
self.introtree = etree.HTML(b2)
result = self.dictformat(self.detailtree)
return result
def getTitle(self, htmltree):
return super().getTitle(self.htmlcodetree).replace('/', ',').replace("\\n",'').replace(' ', '').strip()
def getOutline(self, htmltree):
return super().getOutline(self.introtree).strip(" ['']").replace(u'\\n', '').replace("', '', '", '')
def getCover(self, htmltree):
return super().getCover(self.htmlcodetree)
def getTags(self, htmltree):
result1 = str(self.getAll(htmltree, self.expr_tags)).strip(" ['']").strip('\\n ').strip('\\n')
result2 = str(self.getAll(htmltree, self.expr_tags2)).strip(" ['']").strip('\\n ').strip('\\n')
result = str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','').replace(',,','').split(',')
return result
def getExtrafanart(self, htmltree):
html_pather = re.compile(r'<dd>\s*?<ul>[\s\S]*?</ul>\s*?</dd>')
html = html_pather.search(self.htmlcode)
if html:
html = html.group()
extrafanart_pather = re.compile(r'<a class=\"sample_image\" href=\"(.*?)\"')
extrafanart_imgs = extrafanart_pather.findall(html)
if extrafanart_imgs:
return extrafanart_imgs
return ''
def getTreeIndex(self, tree, expr, index=0):
if expr == '':
return ''
if tree == self.detailtree:
            # NOTE: getMgsString was merged into this method
result1 = str(tree.xpath(expr)).strip(" ['']").strip('\\n ').strip('\\n').strip(" ['']").replace(u'\\n', '').replace("', '', '", '')
result2 = str(tree.xpath(expr.replace('td/a/','td/'))).strip(" ['']").strip('\\n ').strip('\\n')
return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
else:
result = tree.xpath(expr)
try:
return result[index]
except:
return ''
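
The `getTreeIndex` override above evaluates each detail-table expression twice — once as written and once with `td/a/` relaxed to `td/` — because mgstage renders some field values as links and others as bare text. A small self-contained sketch of that fallback, using made-up sample HTML:

from lxml import etree

# Illustrative only: the sample HTML below is invented, not taken from mgstage.
detail = ('<table>'
          '<tr><th>収録時間:</th><td>90min</td></tr>'
          '<tr><th>メーカー:</th><td><a href="#">ExampleStudio</a></td></tr>'
          '</table>')
tree = etree.HTML(detail)
expr = '//th[contains(text(),"収録時間:")]/../td/a/text()'
print(tree.xpath(expr))                          # [] -- the runtime cell holds no <a>
print(tree.xpath(expr.replace('td/a/', 'td/')))  # ['90min'] -- fall back to bare text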

93
scrapinglib/mv91.py Normal file
View File

@@ -0,0 +1,93 @@
# -*- coding: utf-8 -*-
import re
from lxml import etree
from .parser import Parser
class Mv91(Parser):
source = 'mv91'
expr_number = '//div[@class="player-title"]/text()'
expr_title = '//div[@class="player-title"]/text()'
expr_release = '//p[@class="date"]/text()'
expr_outline = '//div[@class="play-text"]/text()'
expr_tags = '//div[@class="player-tag"]/text()'
expr_actor = '//p[@class="player-name"]/text()'
def getHtmlTree(self, url, type=None):
self.htmlcode = self.getHtml(url, type)
if self.htmlcode == 404:
return 404
ret = etree.fromstring(self.htmlcode, etree.HTMLParser())
return ret
def queryNumberUrl(self, number):
keyword = number.replace('91CM-','').replace('91MS-','')
search_html = self.getHtml('https://www.91mv.org/index/search?keywords=' + keyword)
html = etree.fromstring(search_html, etree.HTMLParser())
endurl = html.xpath('//a[@class="video-list"]/@href')[0]
return 'https://www.91mv.org' + endurl
def getNum(self, htmltree):
try:
num = super().getNum(htmltree)
finds = re.findall('(.*)(91.*-\d*)',num)
if finds:
result = str(finds[0][1])
else:
result = ' '.join(num.replace('/',' ').split())
result = result.split()[1]
if self.number.upper() != result.upper():
raise Exception(f'[!] {self.number}: found {result} on mv91, which does not match')
return result.strip()
except:
return ''
def getTitle(self, htmltree):
try:
title = super().getTitle(htmltree)
finds = re.findall('(.*)(91.*-\d*)',title)
if finds:
result = str(finds[0][0])
else:
result = ' '.join(title.replace('/',' ').split())
result = result.split()[0].replace('「预告」','')
return result.strip()
except:
return ''
def getStudio(self, htmltree):
return '91制片厂'
def getTags(self, htmltree):
return self.getAll(htmltree, self.expr_tags)
def getActors(self, htmltree):
b=[]
for player in self.getAll(htmltree, self.expr_actor):
player = player.replace('主演:','')
if '/' in player:
player = player.split('/')[0]
player = re.sub(r'[0-9]+', '', player)
b.append(player)
return b
def getRelease(self, htmltree):
try:
result = super().getRelease(htmltree)
date = result.replace('日期:','')
if isinstance(date, str) and len(date):
return date
except:
pass
return ''
def getCover(self, htmltree):
try:
url = str(re.findall('var pic_url = "(.*?)"', self.htmlcode)[0])
return url.strip()
except:
return ''

259
scrapinglib/parser.py Normal file
View File

@@ -0,0 +1,259 @@
# -*- coding: utf-8 -*-
import json
import re
from lxml import etree, html
from . import httprequest
class Parser:
source = 'base'
imagecut = 1
uncensored = False
# update
proxies = None
cookies = None
verify = None
number = ''
detailurl = ''
# xpath expr
expr_number = ''
expr_title = ''
expr_studio = ''
expr_studio2 = ''
expr_runtime = ''
expr_runtime2 = ''
expr_release = ''
expr_outline = ''
expr_director = ''
expr_actor = ''
expr_tags = ''
expr_label = ''
expr_label2 = ''
expr_series = ''
expr_series2 = ''
expr_cover = ''
expr_cover2 = ''
expr_smallcover = ''
expr_extrafanart = ''
expr_trailer = ''
expr_actorphoto = ''
expr_uncensored = ''
expr_userrating = ''
expr_uservotes = ''
def __init__(self) -> None:
pass
def search(self, number, core: None):
""" 搜索番号
"""
self.number = number
self.updateCore(core)
self.detailurl = self.queryNumberUrl(number)
htmltree = self.getHtmlTree(self.detailurl)
result = self.dictformat(htmltree)
return result
def updateCore(self, core):
""" 从`core`内更新参数
针对需要传递的参数: cookies, proxy等
子类继承后修改
"""
if core.proxies:
self.proxies = core.proxies
def queryNumberUrl(self, number):
""" 根据番号查询详细信息url
备份查询页面,预览图可能需要
"""
url = httprequest.get(number)
return url
def getHtml(self, url, type = None):
""" 访问网页
"""
resp = httprequest.get(url, cookies=self.cookies, proxies=self.proxies, verify=self.verify, return_type=type)
if '<title>404 Page Not Found' in resp \
or '<title>未找到页面' in resp \
or '404 Not Found' in resp \
or '<title>404' in resp \
or '<title>お探しの商品が見つかりません' in resp:
return 404
return resp
def getHtmlTree(self, url, type = None):
""" 访问网页,返回`etree`
"""
resp = self.getHtml(url, type)
if resp == 404:
return 404
ret = etree.fromstring(resp, etree.HTMLParser())
return ret
def dictformat(self, htmltree):
try:
dic = {
'number': self.getNum(htmltree),
'title': self.getTitle(htmltree),
'studio': self.getStudio(htmltree),
'year': self.getYear(htmltree),
'outline': self.getOutline(htmltree),
'runtime': self.getRuntime(htmltree),
'director': self.getDirector(htmltree),
'actor': self.getActors(htmltree),
'release': self.getRelease(htmltree),
'cover': self.getCover(htmltree),
'cover_small': self.getSmallCover(htmltree),
'extrafanart': self.getExtrafanart(htmltree),
'trailer': self.getTrailer(htmltree),
'imagecut': self.imagecut,
'tag': self.getTags(htmltree),
'label': self.getLabel(htmltree),
'actor_photo': self.getActorPhoto(htmltree),
'website': self.detailurl,
'source': self.source,
'series': self.getSeries(htmltree),
'uncensored': self.getUncensored(htmltree),
'userrating': self.getUserRating(htmltree),
'uservotes': self.getUserVotes(htmltree)
}
except Exception as e:
print(e)
dic = {"title": ""}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
return js
def getNum(self, htmltree):
""" 增加 strip 过滤
"""
return self.getTreeIndex(htmltree, self.expr_number)
def getTitle(self, htmltree):
return self.getTreeIndex(htmltree, self.expr_title).strip()
def getStudio(self, htmltree):
try:
return self.getTreeIndex(htmltree, self.expr_studio).strip(" ['']")
except:
pass
try:
return self.getTreeIndex(htmltree, self.expr_studio2).strip(" ['']")
except:
return ''
def getYear(self, htmltree):
""" year基本都是从release中解析的
"""
try:
release = self.getRelease(htmltree)
return str(re.findall('\d{4}', release)).strip(" ['']")
except:
return ''
def getRuntime(self, htmltree):
try:
return self.getTreeIndex(htmltree, self.expr_runtime).strip("\n\t ['']").rstrip('mi')
except:
pass
try:
return self.getTreeIndex(htmltree, self.expr_runtime2).strip("\n\t ['']").rstrip('mi')
except:
return ''
def getRelease(self, htmltree):
return self.getTreeIndex(htmltree, self.expr_release).strip().replace('/','-')
def getOutline(self, htmltree):
return self.getTreeIndex(htmltree, self.expr_outline).strip().replace("\n","")
def getDirector(self, htmltree):
return self.getTreeIndex(htmltree, self.expr_director)
def getActors(self, htmltree):
return self.getAll(htmltree, self.expr_actor)
def getTags(self, htmltree):
return self.getTreeIndex(htmltree, self.expr_tags)
def getLabel(self, htmltree):
try:
return self.getTreeIndex(htmltree, self.expr_label).strip(" ['']")
except:
pass
try:
return self.getTreeIndex(htmltree, self.expr_label2).strip(" ['']")
except:
return ''
def getSeries(self, htmltree):
try:
return self.getTreeIndex(htmltree, self.expr_series).strip(" ['']")
except:
pass
try:
return self.getTreeIndex(htmltree, self.expr_series2).strip(" ['']")
except:
return ''
def getCover(self, htmltree):
try:
return self.getTreeIndex(htmltree, self.expr_cover).strip(" ['']")
except:
pass
try:
return self.getTreeIndex(htmltree, self.expr_cover2).strip(" ['']")
except:
return ''
def getSmallCover(self, htmltree):
return self.getTreeIndex(htmltree, self.expr_smallcover)
def getExtrafanart(self, htmltree):
return self.getAll(htmltree, self.expr_extrafanart)
def getTrailer(self, htmltree):
return self.getTreeIndex(htmltree, self.expr_trailer)
def getActorPhoto(self, htmltree):
return self.getAll(htmltree, self.expr_actorphoto)
def getUncensored(self, htmltree):
if self.expr_uncensored:
u = self.getAll(htmltree, self.expr_uncensored)
return bool(u)
else:
return self.uncensored
def getUserRating(self, htmltree):
return self.getAll(htmltree, self.expr_userrating)
def getUserVotes(self, htmltree):
return self.getAll(htmltree, self.expr_uservotes)
def getTreeIndex(self, tree: html.HtmlElement, expr, index=0):
""" 根据表达式从`xmltree`中获取匹配值,默认 index 为 0
"""
if expr == '':
return ''
result = tree.xpath(expr)
try:
return result[index]
except:
return ''
def getAll(self, tree: html.HtmlElement, expr):
""" 根据表达式从`xmltree`中获取全部匹配值
"""
if expr == '':
return ''
result = tree.xpath(expr)
try:
return result
except:
return ''
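
Concrete scrapers plug into this base class mainly by filling in the `expr_*` XPath strings and, where needed, overriding `queryNumberUrl` or an individual getter; `search()` then fetches the detail page and `dictformat()` assembles the JSON. A minimal sketch of a hypothetical subclass — the site name, URL scheme, and XPath values below are invented for illustration:

# Hypothetical example: 'example.com' and all XPath values are made up.
from scrapinglib.parser import Parser

class Example(Parser):
    source = 'example'
    expr_number = '//span[@class="code"]/text()'
    expr_title = '//h1/text()'
    expr_release = '//span[@class="date"]/text()'
    expr_actor = '//a[@class="actress"]/text()'
    expr_cover = '//img[@id="cover"]/@src'

    def queryNumberUrl(self, number):
        # Assumed URL scheme for the fictional site.
        return 'https://example.com/video/' + number

# Example().search('ABC-123', core) returns the JSON string built by dictformat();
# `core` supplies proxies/cookies through updateCore().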

291
scrapinglib/storyline.py Normal file
View File

@@ -0,0 +1,291 @@
# -*- coding: utf-8 -*-
"""
This part has not been revised yet
"""
import os
import re
import time
import secrets
import builtins
from urllib.parse import urljoin
from lxml.html import fromstring
from multiprocessing.dummy import Pool as ThreadPool
from .httprequest import get_html_by_browser, get_html_by_form, get_html_by_scraper, get_html_session
# The Amazon source has been dropped
G_registered_storyline_site = {"airavwiki", "airav", "avno1", "xcity", "58avgo"}
G_mode_txt = ('顺序执行','线程池')
def is_japanese(raw: str) -> bool:
"""
Simple Japanese detection
"""
return bool(re.search(r'[\u3040-\u309F\u30A0-\u30FF\uFF66-\uFF9F]', raw, re.UNICODE))
class noThread(object):
def map(self, fn, param):
return list(builtins.map(fn, param))
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
pass
# Fetch the storyline: query the listed sites concurrently; earlier entries take priority
def getStoryline(number, title = None, sites: list=None, uncensored=None):
start_time = time.time()
debug = False
storyine_sites = "1:avno1,4:airavwiki".split(',')
if uncensored:
storyine_sites += "3:58avgo".split(',')
else:
storyine_sites += "2:airav,5:xcity".split(',')
r_dup = set()
sort_sites = []
for s in storyine_sites:
ns = re.sub(r'.*?:', '', s, flags=re.A)
if ns in G_registered_storyline_site and ns not in r_dup:
sort_sites.append(s)
r_dup.add(ns)
sort_sites.sort()
apply_sites = [re.sub(r'.*?:', '', s, flags=re.A) for s in sort_sites]
mp_args = ((site, number, title, debug) for site in apply_sites)
cores = min(len(apply_sites), os.cpu_count())
if cores == 0:
return ''
run_mode = 1
with ThreadPool(cores) if run_mode > 0 else noThread() as pool:
results = pool.map(getStoryline_mp, mp_args)
sel = ''
# The debug output below is written to the log
s = f'[!]Storyline{G_mode_txt[run_mode]}模式运行{len(apply_sites)}个任务共耗时(含启动开销){time.time() - start_time:.3f}秒,结束于{time.strftime("%H:%M:%S")}'
sel_site = ''
for site, desc in zip(apply_sites, results):
if isinstance(desc, str) and len(desc):
if not is_japanese(desc):
sel_site, sel = site, desc
break
if not len(sel_site):
sel_site, sel = site, desc
for site, desc in zip(apply_sites, results):
sl = len(desc) if isinstance(desc, str) else 0
s += f'[选中{site}字数:{sl}]' if site == sel_site else f'{site}字数:{sl}' if sl else f'{site}:空'
print(s)
return sel
def getStoryline_mp(args):
(site, number, title, debug) = args
start_time = time.time()
storyline = None
if not isinstance(site, str):
return storyline
elif site == "airavwiki":
storyline = getStoryline_airavwiki(number, debug)
#storyline = getStoryline_airavwiki_super(number, debug)
elif site == "airav":
storyline = getStoryline_airav(number, debug)
elif site == "avno1":
storyline = getStoryline_avno1(number, debug)
elif site == "xcity":
storyline = getStoryline_xcity(number, debug)
# elif site == "amazon":
# storyline = getStoryline_amazon(title, number, debug)
elif site == "58avgo":
storyline = getStoryline_58avgo(number, debug)
if not debug:
return storyline
print("[!]MP 线程[{}]运行{:.3f}秒,结束于{}返回结果: {}".format(
site,
time.time() - start_time,
time.strftime("%H:%M:%S"),
storyline if isinstance(storyline, str) and len(storyline) else '[空]')
)
return storyline
def getStoryline_airav(number, debug):
try:
site = secrets.choice(('airav.cc','airav4.club'))
url = f'https://{site}/searchresults.aspx?Search={number}&Type=0'
res, session = get_html_session(url, return_type='session')
if not res:
raise ValueError(f"get_html_by_session('{url}') failed")
lx = fromstring(res.text)
urls = lx.xpath('//div[@class="resultcontent"]/ul/li/div/a[@class="ga_click"]/@href')
txts = lx.xpath('//div[@class="resultcontent"]/ul/li/div/a[@class="ga_click"]/h3[@class="one_name ga_name"]/text()')
detail_url = None
for txt, url in zip(txts, urls):
if re.search(number, txt, re.I):
detail_url = urljoin(res.url, url)
break
if detail_url is None:
raise ValueError("number not found")
res = session.get(detail_url)
if not res.ok:
raise ValueError(f"session.get('{detail_url}') failed")
lx = fromstring(res.text)
t = str(lx.xpath('/html/head/title/text()')[0]).strip()
airav_number = str(re.findall(r'^\s*\[(.*?)]', t)[0])
if not re.search(number, airav_number, re.I):
raise ValueError(f"page number ->[{airav_number}] not match")
desc = str(lx.xpath('//span[@id="ContentPlaceHolder1_Label2"]/text()')[0]).strip()
return desc
except Exception as e:
if debug:
print(f"[-]MP getStoryline_airav Error: {e},number [{number}].")
pass
return None
def getStoryline_airavwiki(number, debug):
try:
kwd = number[:6] if re.match(r'\d{6}[\-_]\d{2,3}', number) else number
url = f'https://cn.airav.wiki/?search={kwd}'
result, browser = get_html_by_browser(url, return_type='browser', use_scraper=True)
if not result.ok:
raise ValueError(f"get_html_by_browser('{url}','{number}') failed")
s = browser.page.select('div.row > div > div.videoList.row > div > a.d-block')
link = None
for a in s:
title = a.img['title']
list_number = re.findall('^(.*?)\s+', title, re.A)[0].strip()
if kwd == number: # numbers like PRED-164 and RED-164 must be distinguishable
if re.match(f'^{number}$', list_number, re.I):
link = a
break
elif re.search(number, list_number, re.I):
link = a
break
if link is None:
raise ValueError("number not found")
result = browser.follow_link(link)
if not result.ok or not re.search(number, browser.url, re.I):
raise ValueError("detail page not found")
title = browser.page.select('head > title')[0].text.strip()
detail_number = str(re.findall('\[(.*?)]', title)[0])
if not re.search(number, detail_number, re.I):
raise ValueError(f"detail page number not match, got ->[{detail_number}]")
desc = browser.page.select_one('div.d-flex.videoDataBlock > div.synopsis > p').text.strip()
return desc
except Exception as e:
if debug:
print(f"[-]MP def getStoryline_airavwiki Error: {e}, number [{number}].")
pass
return ''
def getStoryline_58avgo(number, debug):
try:
url = 'http://58avgo.com/cn/index.aspx' + secrets.choice([
'', '?status=3', '?status=4', '?status=7', '?status=9', '?status=10', '?status=11', '?status=12',
'?status=1&Sort=Playon', '?status=1&Sort=dateupload', 'status=1&Sort=dateproduce'
]) # pick one at random so a single IP's requests look less uniform in the site's httpd logs
kwd = number[:6] if re.match(r'\d{6}[\-_]\d{2,3}', number) else number
result, browser = get_html_by_form(url,
fields = {'ctl00$TextBox_SearchKeyWord' : kwd},
return_type = 'browser')
if not result:
raise ValueError(f"get_html_by_form('{url}','{number}') failed")
if f'searchresults.aspx?Search={kwd}' not in browser.url:
raise ValueError("number not found")
s = browser.page.select('div.resultcontent > ul > li.listItem > div.one-info-panel.one > a.ga_click')
link = None
for a in s:
title = a.h3.text.strip()
list_number = title[title.rfind(' ')+1:].strip()
if re.search(number, list_number, re.I):
link = a
break
if link is None:
raise ValueError("number not found")
result = browser.follow_link(link)
if not result.ok or 'playon.aspx' not in browser.url:
raise ValueError("detail page not found")
title = browser.page.select_one('head > title').text.strip()
detail_number = str(re.findall('\[(.*?)]', title)[0])
if not re.search(number, detail_number, re.I):
raise ValueError(f"detail page number not match, got ->[{detail_number}]")
return browser.page.select_one('#ContentPlaceHolder1_Label2').text.strip()
except Exception as e:
if debug:
print(f"[-]MP getOutline_58avgo Error: {e}, number [{number}].")
pass
return ''
def getStoryline_avno1(number, debug): # fetch the storyline from avno1.cc
try:
site = secrets.choice(['1768av.club','2nine.net','av999.tv','avno1.cc',
'hotav.biz','iqq2.xyz','javhq.tv',
'www.hdsex.cc','www.porn18.cc','www.xxx18.cc',])
url = f'http://{site}/cn/search.php?kw_type=key&kw={number}'
lx = fromstring(get_html_by_scraper(url))
descs = lx.xpath('//div[@class="type_movie"]/div/ul/li/div/@data-description')
titles = lx.xpath('//div[@class="type_movie"]/div/ul/li/div/a/h3/text()')
if not descs or not len(descs):
raise ValueError(f"number not found")
partial_num = bool(re.match(r'\d{6}[\-_]\d{2,3}', number))
for title, desc in zip(titles, descs):
page_number = title[title.rfind(' ')+1:].strip()
if not partial_num:
if re.match(f'^{number}$', page_number, re.I):
return desc.strip()
elif re.search(number, page_number, re.I):
return desc.strip()
raise ValueError(f"page number ->[{page_number}] not match")
except Exception as e:
if debug:
print(f"[-]MP getOutline_avno1 Error: {e}, number [{number}].")
pass
return ''
def getStoryline_avno1OLD(number, debug): # fetch the storyline from avno1.cc
try:
url = 'http://www.avno1.cc/cn/' + secrets.choice(['usercenter.php?item=' +
secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn']),
'?top=1&cat=hd', '?top=1', '?cat=hd', 'porn', '?cat=jp', '?cat=us', 'recommend_category.php'
]) # pick one at random so a single IP's requests look less uniform in the site's httpd logs
result, browser = get_html_by_form(url,
form_select='div.wrapper > div.header > div.search > form',
fields = {'kw' : number},
return_type = 'browser')
if not result:
raise ValueError(f"get_html_by_form('{url}','{number}') failed")
s = browser.page.select('div.type_movie > div > ul > li > div')
for div in s:
title = div.a.h3.text.strip()
page_number = title[title.rfind(' ')+1:].strip()
if re.search(number, page_number, re.I):
return div['data-description'].strip()
raise ValueError(f"page number ->[{page_number}] not match")
except Exception as e:
if debug:
print(f"[-]MP getOutline_avno1 Error: {e}, number [{number}].")
pass
return ''
def getStoryline_xcity(number, debug): # fetch the storyline from xcity
try:
xcity_number = number.replace('-','')
query_result, browser = get_html_by_form(
'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']),
fields = {'q' : xcity_number.lower()},
return_type = 'browser')
if not query_result or not query_result.ok:
raise ValueError("page not found")
result = browser.follow_link(browser.links('avod\/detail')[0])
if not result.ok:
raise ValueError("detail page not found")
return browser.page.select_one('h2.title-detail + p.lead').text.strip()
except Exception as e:
if debug:
print(f"[-]MP getOutline_xcity Error: {e}, number [{number}].")
pass
return ''
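
For reference, the site-priority handling at the top of `getStoryline` can be restated as a standalone sketch; this only re-illustrates the same logic (priority-prefixed entries, dedup by site, sort, strip prefix) and is not a function exported by this module:

import re

def sorted_storyline_sites(uncensored: bool) -> list:
    # Mirrors getStoryline: "<priority>:<site>" entries, duplicates removed,
    # sorted by priority, then the numeric prefix stripped off.
    registered = {"airavwiki", "airav", "avno1", "xcity", "58avgo"}
    sites = "1:avno1,4:airavwiki".split(',')
    sites += ("3:58avgo" if uncensored else "2:airav,5:xcity").split(',')
    seen, ordered = set(), []
    for s in sites:
        name = re.sub(r'.*?:', '', s)
        if name in registered and name not in seen:
            ordered.append(s)
            seen.add(name)
    ordered.sort()
    return [re.sub(r'.*?:', '', s) for s in ordered]

# sorted_storyline_sites(False) -> ['avno1', 'airav', 'airavwiki', 'xcity']
# sorted_storyline_sites(True)  -> ['avno1', '58avgo', 'airavwiki']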

122
scrapinglib/xcity.py Normal file
View File

@@ -0,0 +1,122 @@
# -*- coding: utf-8 -*-
import re
import secrets
from urllib.parse import urljoin
from lxml import etree
from .httprequest import get_html_by_form
from .parser import Parser
class Xcity(Parser):
source = 'xcity'
expr_number = '//*[@id="hinban"]/text()'
expr_title = '//*[@id="program_detail_title"]/text()'
expr_studio = '//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[4]/a/span/text()'
expr_studio2 = '//strong[contains(text(),"片商")]/../following-sibling::span/a/text()'
expr_runtime = '//span[@class="koumoku" and text()="収録時間"]/../text()'
expr_label = '//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[5]/a/span/text()'
expr_release = '//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[2]/text()'
expr_tags = '//span[@class="koumoku" and text()="ジャンル"]/../a[starts-with(@href,"/avod/genre/")]/text()'
expr_cover = '//*[@id="avodDetails"]/div/div[3]/div[1]/p/a/@href'
expr_director = '//*[@id="program_detail_director"]/text()'
expr_series = "//span[contains(text(),'シリーズ')]/../a/span/text()"
expr_series2 = "//span[contains(text(),'シリーズ')]/../span/text()"
def getStudio(self, htmltree):
return super().getStudio(htmltree).strip('+').replace("', '", '').replace('"', '')
def getRuntime(self, htmltree):
return self.getAll(htmltree, self.expr_runtime)[1].strip()
def getRelease(self, htmltree):
try:
result = self.getTreeIndex(htmltree, self.expr_release, 1)
return re.findall('\d{4}/\d{2}/\d{2}', result)[0].replace('/','-')
except:
return ''
def getTags(self, htmltree):
result = self.getAll(htmltree, self.expr_tags)
total = []
for i in result:
total.append(i.replace("\n","").replace("\t",""))
return total
def getCover(self, htmltree):
try:
result = super().getCover(htmltree)
return 'https:' + result
except:
return ''
def getDirector(self, htmltree):
try:
result = super().getDirector(htmltree).replace(u'\n','').replace(u'\t', '')
return result
except:
return ''
def getOutline(self, htmltree):
from .storyline import getStoryline
return getStoryline(self.number, uncensored=False)
def getActors(self, htmltree):
htmla = self.browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a')
t = []
for i in htmla:
t.append(i.text.strip())
return t
def getActorPhoto(self, htmltree):
htmla = self.browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a')
t = {i.text.strip(): i['href'] for i in htmla}
o = {}
for k, v in t.items():
r = self.browser.open_relative(v)
if not r.ok:
continue
pic = self.browser.page.select_one('#avidolDetails > div > div.frame > div > p > img')
if 'noimage.gif' in pic['src']:
continue
o[k] = urljoin(self.browser.url, pic['src'])
return o
def getExtrafanart(self, htmltree):
html_pather = re.compile(r'<div id="sample_images".*?>[\s\S]*?</div>')
html = html_pather.search(self.detail_page)
if html:
html = html.group()
extrafanart_pather = re.compile(r'<a.*?href=\"(.*?)\"')
extrafanart_imgs = extrafanart_pather.findall(html)
if extrafanart_imgs:
s = []
for urli in extrafanart_imgs:
urli = 'https:' + urli.replace('/scene/small', '')
s.append(urli)
return s
return ''
def open_by_browser(self, number):
xcity_number = number.replace('-','')
query_result, browser = get_html_by_form(
'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']),
fields = {'q' : xcity_number.lower()},
return_type = 'browser')
if not query_result or not query_result.ok:
raise ValueError("xcity.py: page not found")
result = browser.follow_link(browser.links('avod\/detail')[0])
if not result.ok:
raise ValueError("xcity.py: detail page not found")
return str(browser.page), browser
def search(self, number, core: None):
self.number = number
self.updateCore(core)
self.detail_page, self.browser = self.open_by_browser(number)
self.detailurl = self.browser.url
lx = etree.fromstring(self.detail_page, etree.HTMLParser())
result = self.dictformat(lx)
return result