update scrapinglib
- support specifiedUrl when scraping a single movie
- support javlibrary and rating
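A minimal usage sketch of the new specifiedUrl path, based on the xcity.py changes below; constructing the parser directly, the import path, and the placeholder URL/number are assumptions for illustration, not part of this commit:

# Hedged sketch: Xcity, search() and specifiedUrl appear in the diff below;
# the import path and the placeholder values here are illustrative only.
from scrapinglib.xcity import Xcity

parser = Xcity()
parser.specifiedUrl = 'https://xcity.jp/avod/detail/?id=PLACEHOLDER'  # detail page of the single movie
result = parser.search('ABCD-123')  # with specifiedUrl set, search() parses that page directly
print(result)                       # dict produced by dictformat()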
@@ -13,6 +13,9 @@ class Xcity(Parser):
    expr_number = '//*[@id="hinban"]/text()'
    expr_title = '//*[@id="program_detail_title"]/text()'
    expr_actor = '//ul/li[@class="credit-links"]/a/text()'
    expr_actor_link = '//ul/li[@class="credit-links"]/a'
    expr_actorphoto = '//div[@class="frame"]/div/p/img/@src'
    expr_studio = '//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[4]/a/span/text()'
    expr_studio2 = '//strong[contains(text(),"片商")]/../following-sibling::span/a/text()'
    expr_runtime = '//span[@class="koumoku" and text()="収録時間"]/../text()'
@@ -23,6 +26,7 @@ class Xcity(Parser):
    expr_director = '//*[@id="program_detail_director"]/text()'
    expr_series = "//span[contains(text(),'シリーズ')]/../a/span/text()"
    expr_series2 = "//span[contains(text(),'シリーズ')]/../span/text()"
    expr_extrafanart = '//div[@id="sample_images"]/div/a/@href'

    def getStudio(self, htmltree):
        return super().getStudio(htmltree).strip('+').replace("', '", '').replace('"', '')
@@ -57,41 +61,29 @@ class Xcity(Parser):
            return getStoryline(self.number, uncensored=False)
        return ''

    def getActors(self, htmltree):
        htmla = self.browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a')
        t = []
        for i in htmla:
            t.append(i.text.strip())
        return t

    def getActorPhoto(self, htmltree):
        htmla = self.browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a')
        t = {i.text.strip(): i['href'] for i in htmla}
        treea = self.getTreeAll(htmltree, self.expr_actor_link)
        t = {i.text.strip(): i.attrib['href'] for i in treea}
        o = {}
        for k, v in t.items():
            r = self.browser.open_relative(v)
            if not r.ok:
                continue
            pic = self.browser.page.select_one('#avidolDetails > div > div.frame > div > p > img')
            if 'noimage.gif' in pic['src']:
                continue
            o[k] = urljoin(self.browser.url, pic['src'])
            actorpageUrl = "https://xcity.jp" + v
            try:
                adtree = self.getHtmlTree(actorpageUrl)
                picUrl = self.getTreeElement(adtree, self.expr_actorphoto)
                if 'noimage.gif' in picUrl:
                    continue
                o[k] = urljoin("https://xcity.jp", picUrl)
            except:
                pass
        return o

    def getExtrafanart(self, htmltree):
        html_pather = re.compile(r'<div id="sample_images".*?>[\s\S]*?</div>')
        html = html_pather.search(self.detail_page)
        if html:
            html = html.group()
            extrafanart_pather = re.compile(r'<a.*?href=\"(.*?)\"')
            extrafanart_imgs = extrafanart_pather.findall(html)
            if extrafanart_imgs:
                s = []
                for urli in extrafanart_imgs:
                    urli = 'https:' + urli.replace('/scene/small', '')
                    s.append(urli)
                return s
        return ''
        arts = self.getTreeAll(htmltree, self.expr_extrafanart)
        extrafanart = []
        for i in arts:
            i = "https:" + i
            extrafanart.append(i)
        return extrafanart

    def open_by_browser(self, number):
        xcity_number = number.replace('-','')
@@ -108,8 +100,12 @@ class Xcity(Parser):

    def search(self, number):
        self.number = number
        self.detail_page, self.browser = self.open_by_browser(number)
        self.detailurl = self.browser.url
        lx = etree.fromstring(self.detail_page, etree.HTMLParser())
        if self.specifiedUrl:
            self.detailurl = self.specifiedUrl
            lx = self.getHtmlTree(self.detailurl)
        else:
            self.detail_page, self.browser = self.open_by_browser(number)
            self.detailurl = self.browser.url
            lx = etree.fromstring(self.detail_page, etree.HTMLParser())
        result = self.dictformat(lx)
        return result
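For context, a rough sketch of what the tree-based helpers used above (getHtmlTree, getTreeElement, getTreeAll) appear to boil down to; the fetch_tree/first_match names and the requests dependency are assumptions for illustration, the real implementations live on the Parser base class:

# Hedged sketch of minimal equivalents to the tree helpers used in this diff.
import requests
from lxml import etree

def fetch_tree(url):
    # Download a page and parse it into an lxml HTML tree (cf. getHtmlTree).
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    return etree.fromstring(resp.text, etree.HTMLParser())

def first_match(tree, expr, default=''):
    # Return the first XPath match, e.g. a text node or an @src/@href value
    # (cf. getTreeElement with the expr_* class attributes).
    results = tree.xpath(expr)
    return results[0] if results else default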