Merge pull request #591 from lededev/xcity-f1
xcity.py: get detail page by form query
This commit is contained in:
@@ -55,7 +55,7 @@ def get_data_from_json(file_number, conf: config.Config): # 从JSON返回元数
|
||||
|
||||
# default fetch order list, from the beginning to the end
|
||||
sources = conf.sources().split(',')
|
||||
if not len(conf.sources()) > 60:
|
||||
if not len(conf.sources()) > 80:
|
||||
# if the input file name matches certain rules,
|
||||
# move some web service to the beginning of the list
|
||||
lo_file_number = file_number.lower()
|
||||
@@ -82,6 +82,11 @@ def get_data_from_json(file_number, conf: config.Config): # 从JSON返回元数
|
||||
"rj" in lo_file_number or "vj" in lo_file_number
|
||||
):
|
||||
sources.insert(0, sources.pop(sources.index("dlsite")))
|
||||
elif re.match(r"^[a-z0-9]{3,}$", lo_file_number):
|
||||
if "javdb" in sources:
|
||||
sources.insert(0, sources.pop(sources.index("javdb")))
|
||||
if "xcity" in sources:
|
||||
sources.insert(0, sources.pop(sources.index("xcity")))
|
||||
|
||||
# check sources in func_mapping
|
||||
todel = []
|
||||
|
||||
@@ -25,7 +25,7 @@ def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
|
||||
l=i.a['href']
|
||||
t=i.get_text()
|
||||
html = etree.fromstring(get_html(l), etree.HTMLParser())
|
||||
p=abs_url("https://www.javbus.com",
|
||||
p=urljoin("https://www.javbus.com",
|
||||
str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']"))
|
||||
p2={t:p}
|
||||
d.update(p2)
|
||||
@@ -60,7 +60,7 @@ def getYear(htmlcode): #获取年份
|
||||
def getCover(htmlcode): #获取封面链接
|
||||
doc = pq(htmlcode)
|
||||
image = doc('a.bigImage')
|
||||
return abs_url("https://www.javbus.com", image.attr('href'))
|
||||
return urljoin("https://www.javbus.com", image.attr('href'))
|
||||
def getRelease(htmlcode): #获取出版日期
|
||||
html = etree.fromstring(htmlcode, etree.HTMLParser())
|
||||
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
|
||||
|
||||
@@ -8,15 +8,16 @@ from ADC_function import *
|
||||
|
||||
def main(number: str) -> json:
|
||||
try:
|
||||
caribbytes = get_html('https://www.caribbeancom.com/moviepages/'+number+'/index.html',
|
||||
return_type="content")
|
||||
caribbytes, browser = get_html_by_browser(
|
||||
'https://www.caribbeancom.com/moviepages/'+number+'/index.html',
|
||||
return_type="browser")
|
||||
|
||||
caribhtml = caribbytes.decode("euc_jp")
|
||||
if not caribbytes or not caribbytes.ok:
|
||||
raise ValueError("page not found")
|
||||
|
||||
soup = BeautifulSoup(caribhtml, "html.parser")
|
||||
lx = html.fromstring(str(soup))
|
||||
lx = html.fromstring(str(browser.page))
|
||||
|
||||
if not soup.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"):
|
||||
if not browser.page.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"):
|
||||
raise ValueError("page info not found")
|
||||
except Exception as e:
|
||||
if config.Config().debug():
|
||||
@@ -27,7 +28,7 @@ def main(number: str) -> json:
|
||||
'title': get_title(lx),
|
||||
'studio': '加勒比',
|
||||
'year': get_year(lx),
|
||||
'outline': '',
|
||||
'outline': get_outline(lx),
|
||||
'runtime': get_runtime(lx),
|
||||
'director': '',
|
||||
'actor': get_actor(lx),
|
||||
@@ -36,12 +37,12 @@ def main(number: str) -> json:
|
||||
'cover': 'https://www.caribbeancom.com/moviepages/' + number + '/images/l_l.jpg',
|
||||
'tag': get_tag(lx),
|
||||
'extrafanart': get_extrafanart(lx),
|
||||
'label': '',
|
||||
'imagecut': 0,
|
||||
'actor_photo': '',
|
||||
'label': get_series(lx),
|
||||
'imagecut': 1,
|
||||
'actor_photo': get_actor_photo(browser),
|
||||
'website': 'https://www.caribbeancom.com/moviepages/' + number + '/index.html',
|
||||
'source': 'carib.py',
|
||||
'series': '',
|
||||
'series': get_series(lx),
|
||||
}
|
||||
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
|
||||
return js
|
||||
@@ -52,10 +53,13 @@ def get_title(lx: html.HtmlElement) -> str:
|
||||
def get_year(lx: "html.HtmlElement") -> str:
    """Return the first four characters of the second spec entry's text.

    The value is read from ``//li[2]/span[@class='spec-content']``, the same
    node ``get_release`` uses for the release date.

    Args:
        lx: Parsed detail page exposing ``xpath``.

    Returns:
        The leading four characters of the matched text, or ``''`` when the
        page has no such node.
    """
    found = lx.xpath("//li[2]/span[@class='spec-content']/text()")
    # An empty match list used to raise IndexError; report "unknown" instead.
    return str(found[0])[:4] if found else ''
|
||||
|
||||
def get_outline(lx: "html.HtmlElement") -> str:
    """Return the movie description text with surrounding whitespace removed.

    Args:
        lx: Parsed detail page exposing ``xpath``.

    Returns:
        The first text node of the ``itemprop='description'`` paragraph inside
        the ``movie-info section`` div, or ``''`` when the page has none.
    """
    found = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")
    # A page without a description used to raise IndexError here; an empty
    # outline matches how get_series reports a missing value.
    return str(found[0]).strip() if found else ''
|
||||
|
||||
def get_release(lx: "html.HtmlElement") -> str:
    """Return the second spec entry's text with ``/`` replaced by ``-``.

    Args:
        lx: Parsed detail page exposing ``xpath``.

    Returns:
        The converted text of ``//li[2]/span[@class='spec-content']``, or
        ``''`` when the page has no such node.
    """
    found = lx.xpath("//li[2]/span[@class='spec-content']/text()")
    # Guard the [0] lookup so a missing spec row no longer raises IndexError.
    return str(found[0]).replace('/', '-') if found else ''
|
||||
|
||||
def get_actor(lx: html.HtmlElement) -> str:
|
||||
def get_actor(lx: html.HtmlElement):
|
||||
r = []
|
||||
actors = lx.xpath("//span[@class='spec-content']/a[@itemprop='actor']/span/text()")
|
||||
for act in actors:
|
||||
@@ -81,9 +85,44 @@ def get_extrafanart(lx: html.HtmlElement) -> str:
|
||||
r.append('https://www.caribbeancom.com' + jpg)
|
||||
return r
|
||||
|
||||
def get_series(lx: "html.HtmlElement") -> str:
    """Return the series name linked in the spec row titled 'シリーズ'.

    Args:
        lx: Parsed detail page exposing ``xpath``.

    Returns:
        The stripped link text, or ``''`` when the page has no series row.
    """
    found = lx.xpath("//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()")
    # Only the "no match" case is expected; checking for it explicitly replaces
    # a bare ``except:`` that would also have hidden unrelated failures.
    return str(found[0]).strip() if found else ''
|
||||
|
||||
def get_runtime(lx: "html.HtmlElement") -> str:
    """Return the text of the ``itemprop='duration'`` span, stripped.

    Args:
        lx: Parsed detail page exposing ``xpath``.

    Returns:
        The stripped duration text, or ``''`` when the page has no such span.
    """
    found = lx.xpath("//span[@class='spec-content']/span[@itemprop='duration']/text()")
    # Guard the [0] lookup so a missing duration no longer raises IndexError.
    return str(found[0]).strip() if found else ''
|
||||
|
||||
def get_actor_photo(browser):
    """Map each credited actor name to the photo URL found on their page.

    Reads the actor links from the first spec row of ``browser.page``, opens
    every link containing ``/search_act/`` with ``browser.open_relative`` and
    looks in the resulting page's markup for the ``.full-bg`` CSS rule whose
    ``background: url(...)`` points at a ``.jpg`` file.

    Note that this navigates ``browser`` away from the detail page.

    Args:
        browser: Browser object exposing ``page``, ``url`` and
            ``open_relative`` (presumably a MechanicalSoup browser; confirm
            against ``get_html_by_browser``).

    Returns:
        A dict of actor name -> absolute photo URL. Actors with no usable
        link, a failed request or no matching CSS rule are omitted.
    """
    import re  # local import: the rest of the module may not have it in scope

    anchors = browser.page.select('#moviepages > div > div:nth-child(1) > div.movie-info.section > ul > li:nth-child(1) > span.spec-content > a')
    profile_links = {}
    for anchor in anchors:
        name = anchor.text.strip()
        if name == '他':  # the "others" placeholder link is not an actor
            continue
        try:
            profile_links[name] = anchor['href']
        except KeyError:
            # An anchor without href cannot be followed; skip it instead of failing.
            continue

    # Find the image inside the .full-bg rule itself. The previous fixed
    # 100-character window silently lost photos whose URL was longer than that.
    background = re.compile(
        r"\.full-bg[^}]*?background:\s*url\(\s*['\"]?([^)}'\"]+?\.jpg)['\"]?\s*\)"
    )

    photos = {}
    for name, href in profile_links.items():
        if '/search_act/' not in href:
            continue
        response = browser.open_relative(href)
        if not response.ok:
            continue
        found = background.search(browser.page.prettify())
        if found is None:
            continue
        # browser.url is read after open_relative, so relative image paths
        # resolve against the actor page that was just opened.
        photos[name] = urljoin(browser.url, found.group(1))
    return photos
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke run: scrape a few sample numbers and print the JSON.
    for sample_number in (
        "070116-197",  # this entry's actor has a photo
        "041721-001",
        "080520-001",
    ):
        print(main(sample_number))
|
||||
|
||||
@@ -17,7 +17,7 @@ def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
|
||||
l=i.a['href']
|
||||
t=i.get_text()
|
||||
html = etree.fromstring(get_html(l), etree.HTMLParser())
|
||||
p=abs_url("https://www.javbus.com",
|
||||
p=urljoin("https://www.javbus.com",
|
||||
str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']"))
|
||||
p2={t:p}
|
||||
d.update(p2)
|
||||
@@ -48,7 +48,7 @@ def getYear(htmlcode): #获取年份
|
||||
def getCover(htmlcode): #获取封面链接
|
||||
doc = pq(htmlcode)
|
||||
image = doc('a.bigImage')
|
||||
return abs_url("https://www.javbus.com", image.attr('href'))
|
||||
return urljoin("https://www.javbus.com", image.attr('href'))
|
||||
def getRelease(htmlcode): #获取出版日期
|
||||
html = etree.fromstring(htmlcode, etree.HTMLParser())
|
||||
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
|
||||
|
||||
@@ -17,19 +17,30 @@ def getTitle(a):
|
||||
return result
|
||||
|
||||
|
||||
def getActor(a): # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
|
||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||
result1 = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[3]/a/text()')[0]
|
||||
return result1
|
||||
def getActor(browser):
    """Return the names shown in the detail page's credit links.

    Args:
        browser: Browser object whose ``page`` currently holds the detail
            page and supports CSS ``select``.

    Returns:
        A list with the stripped text of every matched link, in page order;
        empty when nothing matches.
    """
    credit_links = browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a')
    return [link.text.strip() for link in credit_links]
|
||||
|
||||
|
||||
def getActorPhoto(actor): # //*[@id="star_qdt"]/li/a/img
|
||||
a = actor.split(',')
|
||||
d = {}
|
||||
for i in a:
|
||||
p = {i: ''}
|
||||
d.update(p)
|
||||
return d
|
||||
def getActorPhoto(browser):
    """Map each credited name to the portrait URL from their profile page.

    Collects the credit links from ``browser.page``, opens each one with
    ``browser.open_relative`` and reads the ``src`` of the profile image.

    Note that this navigates ``browser`` away from the detail page.

    Args:
        browser: Browser object exposing ``page``, ``url`` and
            ``open_relative``.

    Returns:
        A dict of name -> absolute image URL. The value is ``''`` when the
        profile request fails or the profile page has no matching image.
    """
    credit_links = browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a')
    profile_urls = {link.text.strip(): link['href'] for link in credit_links}

    photos = {}
    for name, href in profile_urls.items():
        photo_url = ''
        response = browser.open_relative(href)
        if response.ok:
            portrait = browser.page.select_one('#avidolDetails > div > div.frame > div > p > img')
            # select_one yields None when the image is absent; keep '' then.
            if portrait is not None:
                photo_url = urljoin(browser.url, portrait['src'])
        # The failure branch used to build the set ``{k, ''}``, which made
        # dict.update raise ValueError; store a real key/value pair instead.
        photos[name] = photo_url
    return photos
|
||||
|
||||
|
||||
def getStudio(a):
|
||||
@@ -82,7 +93,7 @@ def getYear(getRelease):
|
||||
def getRelease(a):
|
||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||
try:
|
||||
result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[4]/text()')[0]
|
||||
result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[2]/text()')[1])
|
||||
except:
|
||||
return ''
|
||||
try:
|
||||
@@ -171,21 +182,34 @@ def getExtrafanart(htmlcode): # 获取剧照
|
||||
|
||||
def main(number):
|
||||
try:
|
||||
number = number.upper()
|
||||
query_result = get_html(
|
||||
'https://xcity.jp/result_published/?genre=%2Fresult_published%2F&q=' + number.replace('-','') + '&sg=main&num=30')
|
||||
html = etree.fromstring(query_result, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
||||
urls = html.xpath("//table[contains(@class, 'resultList')]/tr[2]/td[1]/a/@href")[0]
|
||||
detail_page = get_html('https://xcity.jp' + urls)
|
||||
xcity_number = number.replace('-','')
|
||||
query_result, browser = get_html_by_form(
|
||||
'https://xcity.jp/about/',
|
||||
fields = {'q' : xcity_number.lower()},
|
||||
return_type = 'browser')
|
||||
if not query_result or not query_result.ok:
|
||||
raise ValueError("xcity.py: page not found")
|
||||
result = browser.follow_link(browser.links('avod\/detail')[0])
|
||||
if not result.ok:
|
||||
raise ValueError("xcity.py: detail page not found")
|
||||
detail_page = str(browser.page)
|
||||
url = browser.url
|
||||
newnum = getNum(detail_page).upper()
|
||||
number_up = number.upper()
|
||||
if newnum != number_up:
|
||||
if newnum == xcity_number.upper():
|
||||
newnum = number_up
|
||||
else:
|
||||
raise ValueError("xcity.py: number not found")
|
||||
dic = {
|
||||
'actor': getActor(detail_page),
|
||||
'actor': getActor(browser),
|
||||
'title': getTitle(detail_page),
|
||||
'studio': getStudio(detail_page),
|
||||
'outline': getOutline(detail_page),
|
||||
'runtime': getRuntime(detail_page),
|
||||
'director': getDirector(detail_page),
|
||||
'release': getRelease(detail_page),
|
||||
'number': getNum(detail_page),
|
||||
'number': newnum,
|
||||
'cover': getCover(detail_page),
|
||||
'cover_small': '',
|
||||
'extrafanart': getExtrafanart(detail_page),
|
||||
@@ -193,8 +217,8 @@ def main(number):
|
||||
'tag': getTag(detail_page),
|
||||
'label': getLabel(detail_page),
|
||||
'year': getYear(getRelease(detail_page)), # str(re.search('\d{4}',getRelease(a)).group()),
|
||||
'actor_photo': getActorPhoto(getActor(detail_page)),
|
||||
'website': 'https://xcity.jp' + urls,
|
||||
'actor_photo': getActorPhoto(browser),
|
||||
'website': url,
|
||||
'source': 'xcity.py',
|
||||
'series': getSeries(detail_page),
|
||||
}
|
||||
@@ -207,4 +231,6 @@ def main(number):
|
||||
return js
|
||||
|
||||
if __name__ == '__main__':
|
||||
print(main('VNDS-2624'))
|
||||
print(main('RCTD-288'))
|
||||
#print(main('VNDS-2624'))
|
||||
#print(main('ABP-345'))
|
||||
|
||||
Reference in New Issue
Block a user