Merge pull request #591 from lededev/xcity-f1

xcity.py: get detail page by form query
This commit is contained in:
Yoshiko2
2021-09-27 22:00:43 +08:00
committed by GitHub
7 changed files with 158 additions and 48 deletions

View File

@@ -11,6 +11,7 @@ from lxml import etree
import re import re
import config import config
from urllib.parse import urljoin from urllib.parse import urljoin
import mechanicalsoup
def getXpathSingle(htmlcode, xpath): def getXpathSingle(htmlcode, xpath):
@@ -83,6 +84,51 @@ def post_html(url: str, query: dict, headers: dict = None) -> requests.Response:
print("[-]" + errors) print("[-]" + errors)
def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type: str = None):
    """Open *url* with a MechanicalSoup ``StatefulBrowser`` and return the page.

    Args:
        url: Address to open.
        cookies: Optional cookies added to the browser session before the
            request is sent.
        ua: User-Agent string; ``G_USER_AGENT`` is used when ``None``.
        return_type: Selects what is returned on success:
            ``"object"`` -> the response object,
            ``"content"`` -> the raw body (``response.content``),
            ``"browser"`` -> the tuple ``(response, browser)``,
            anything else -> the body decoded as text (``response.text``).

    Returns:
        The value selected by *return_type*, or ``''`` when the response is
        not OK.
    """
    browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua)
    # Apply caller-supplied cookies; the parameter used to be accepted but
    # silently ignored, unlike get_html_by_form().
    if isinstance(cookies, dict):
        requests.utils.add_dict_to_cookiejar(browser.session.cookies, cookies)
    configProxy = config.Config().proxy()
    if configProxy.enable:
        browser.session.proxies = configProxy.proxies()
    result = browser.open(url)
    if not result.ok:
        # NOTE(review): callers that unpack ``response, browser`` get a
        # ValueError from this empty string; kept for backward compatibility.
        return ''
    # Force UTF-8 so that ``.text`` does not depend on charset guessing.
    result.encoding = "utf-8"
    if return_type == "object":
        return result
    elif return_type == "content":
        return result.content
    elif return_type == "browser":
        return result, browser
    else:
        return result.text
def get_html_by_form(url, form_name: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None):
    """Open *url*, fill in a form on that page, submit it and return the result.

    Args:
        url: Address of the page that contains the form.
        form_name: Selector passed to ``browser.select_form``; when ``None``
            ``select_form()`` is called without arguments.
        fields: Optional mapping of form field name -> value to set before
            submitting.
        cookies: Optional cookies added to the browser session before the
            first request.
        ua: User-Agent string; ``G_USER_AGENT`` is used when ``None``.
        return_type: Selects what is returned for the submit response:
            ``"object"`` -> the response object,
            ``"content"`` -> the raw body (``response.content``),
            ``"browser"`` -> the tuple ``(response, browser)``,
            anything else -> the body decoded as text (``response.text``).

    Returns:
        The value selected by *return_type*, or ``''`` when the page holding
        the form could not be opened successfully.
    """
    browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua)
    if isinstance(cookies, dict):
        requests.utils.add_dict_to_cookiejar(browser.session.cookies, cookies)
    configProxy = config.Config().proxy()
    if configProxy.enable:
        browser.session.proxies = configProxy.proxies()
    result = browser.open(url)
    if not result.ok:
        # NOTE(review): callers that unpack ``response, browser`` get a
        # ValueError from this empty string; kept for backward compatibility.
        return ''
    # Only the selection side effect matters; the returned form object was
    # never used, so it is no longer bound to a local.
    if form_name is None:
        browser.select_form()
    else:
        browser.select_form(form_name)
    if isinstance(fields, dict):
        for k, v in fields.items():
            browser[k] = v
    response = browser.submit_selected()
    # Force UTF-8 so that ``.text`` does not depend on charset guessing.
    response.encoding = "utf-8"
    if return_type == "object":
        return response
    elif return_type == "content":
        return response.content
    elif return_type == "browser":
        return response, browser
    else:
        return response.text
# def get_javlib_cookie() -> [dict, str]: # def get_javlib_cookie() -> [dict, str]:
# import cloudscraper # import cloudscraper
# switch, proxy, timeout, retry_count, proxytype = config.Config().proxy() # switch, proxy, timeout, retry_count, proxytype = config.Config().proxy()
@@ -568,10 +614,3 @@ def is_link(filename: str):
elif os.stat(filename).st_nlink > 1: elif os.stat(filename).st_nlink > 1:
return True # hard link Linux MAC OSX Windows NTFS return True # hard link Linux MAC OSX Windows NTFS
return False return False
# URL相对路径转绝对路径
def abs_url(base_url: str, href: str) -> str:
if href.startswith('http'):
return href
return urljoin(base_url, href)

View File

@@ -55,7 +55,7 @@ def get_data_from_json(file_number, conf: config.Config): # 从JSON返回元数
# default fetch order list, from the beginning to the end # default fetch order list, from the beginning to the end
sources = conf.sources().split(',') sources = conf.sources().split(',')
if not len(conf.sources()) > 60: if not len(conf.sources()) > 80:
# if the input file name matches certain rules, # if the input file name matches certain rules,
# move some web service to the beginning of the list # move some web service to the beginning of the list
lo_file_number = file_number.lower() lo_file_number = file_number.lower()
@@ -82,6 +82,11 @@ def get_data_from_json(file_number, conf: config.Config): # 从JSON返回元数
"rj" in lo_file_number or "vj" in lo_file_number "rj" in lo_file_number or "vj" in lo_file_number
): ):
sources.insert(0, sources.pop(sources.index("dlsite"))) sources.insert(0, sources.pop(sources.index("dlsite")))
elif re.match(r"^[a-z0-9]{3,}$", lo_file_number):
if "javdb" in sources:
sources.insert(0, sources.pop(sources.index("javdb")))
if "xcity" in sources:
sources.insert(0, sources.pop(sources.index("xcity")))
# check sources in func_mapping # check sources in func_mapping
todel = [] todel = []

View File

@@ -25,7 +25,7 @@ def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
l=i.a['href'] l=i.a['href']
t=i.get_text() t=i.get_text()
html = etree.fromstring(get_html(l), etree.HTMLParser()) html = etree.fromstring(get_html(l), etree.HTMLParser())
p=abs_url("https://www.javbus.com", p=urljoin("https://www.javbus.com",
str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']")) str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']"))
p2={t:p} p2={t:p}
d.update(p2) d.update(p2)
@@ -60,7 +60,7 @@ def getYear(htmlcode): #获取年份
def getCover(htmlcode): #获取封面链接 def getCover(htmlcode): #获取封面链接
doc = pq(htmlcode) doc = pq(htmlcode)
image = doc('a.bigImage') image = doc('a.bigImage')
return abs_url("https://www.javbus.com", image.attr('href')) return urljoin("https://www.javbus.com", image.attr('href'))
def getRelease(htmlcode): #获取出版日期 def getRelease(htmlcode): #获取出版日期
html = etree.fromstring(htmlcode, etree.HTMLParser()) html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']") result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")

View File

@@ -8,15 +8,16 @@ from ADC_function import *
def main(number: str) -> json: def main(number: str) -> json:
try: try:
caribbytes = get_html('https://www.caribbeancom.com/moviepages/'+number+'/index.html', caribbytes, browser = get_html_by_browser(
return_type="content") 'https://www.caribbeancom.com/moviepages/'+number+'/index.html',
return_type="browser")
caribhtml = caribbytes.decode("euc_jp") if not caribbytes or not caribbytes.ok:
raise ValueError("page not found")
soup = BeautifulSoup(caribhtml, "html.parser") lx = html.fromstring(str(browser.page))
lx = html.fromstring(str(soup))
if not soup.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"): if not browser.page.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"):
raise ValueError("page info not found") raise ValueError("page info not found")
except Exception as e: except Exception as e:
if config.Config().debug(): if config.Config().debug():
@@ -27,7 +28,7 @@ def main(number: str) -> json:
'title': get_title(lx), 'title': get_title(lx),
'studio': '加勒比', 'studio': '加勒比',
'year': get_year(lx), 'year': get_year(lx),
'outline': '', 'outline': get_outline(lx),
'runtime': get_runtime(lx), 'runtime': get_runtime(lx),
'director': '', 'director': '',
'actor': get_actor(lx), 'actor': get_actor(lx),
@@ -36,12 +37,12 @@ def main(number: str) -> json:
'cover': 'https://www.caribbeancom.com/moviepages/' + number + '/images/l_l.jpg', 'cover': 'https://www.caribbeancom.com/moviepages/' + number + '/images/l_l.jpg',
'tag': get_tag(lx), 'tag': get_tag(lx),
'extrafanart': get_extrafanart(lx), 'extrafanart': get_extrafanart(lx),
'label': '', 'label': get_series(lx),
'imagecut': 0, 'imagecut': 1,
'actor_photo': '', 'actor_photo': get_actor_photo(browser),
'website': 'https://www.caribbeancom.com/moviepages/' + number + '/index.html', 'website': 'https://www.caribbeancom.com/moviepages/' + number + '/index.html',
'source': 'carib.py', 'source': 'carib.py',
'series': '', 'series': get_series(lx),
} }
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
return js return js
@@ -52,10 +53,13 @@ def get_title(lx: html.HtmlElement) -> str:
def get_year(lx: html.HtmlElement) -> str: def get_year(lx: html.HtmlElement) -> str:
return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0][:4] return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0][:4]
def get_outline(lx: html.HtmlElement) -> str:
    """Return the movie description text, or '' when the page has none.

    Args:
        lx: Parsed movie page; only its ``xpath`` method is used.

    Returns:
        The first text node of the description paragraph inside the
        ``movie-info section`` block, stripped of surrounding whitespace.
    """
    texts = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")
    # An empty result means the paragraph is missing; indexing it would raise
    # IndexError, so fall back to '' the same way get_series() does.
    if not texts:
        return ''
    return texts[0].strip()
def get_release(lx: html.HtmlElement) -> str: def get_release(lx: html.HtmlElement) -> str:
return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0].replace('/','-') return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0].replace('/','-')
def get_actor(lx: html.HtmlElement) -> str: def get_actor(lx: html.HtmlElement):
r = [] r = []
actors = lx.xpath("//span[@class='spec-content']/a[@itemprop='actor']/span/text()") actors = lx.xpath("//span[@class='spec-content']/a[@itemprop='actor']/span/text()")
for act in actors: for act in actors:
@@ -81,9 +85,44 @@ def get_extrafanart(lx: html.HtmlElement) -> str:
r.append('https://www.caribbeancom.com' + jpg) r.append('https://www.caribbeancom.com' + jpg)
return r return r
def get_series(lx: html.HtmlElement) -> str:
    """Return the series name shown in the spec list, or '' when there is none.

    Args:
        lx: Parsed movie page; only its ``xpath`` method is used.

    Returns:
        The stripped text of the first link in the spec row whose title
        contains 'シリーズ', or '' when no such row/link exists.
    """
    names = lx.xpath("//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()")
    # "No series row" is the only expected failure; test for it explicitly
    # instead of hiding every exception behind a bare ``except``.
    if not names:
        return ''
    return str(names[0]).strip()
def get_runtime(lx: html.HtmlElement) -> str: def get_runtime(lx: html.HtmlElement) -> str:
return str(lx.xpath( "//span[@class='spec-content']/span[@itemprop='duration']/text()")[0]).strip() return str(lx.xpath( "//span[@class='spec-content']/span[@itemprop='duration']/text()")[0]).strip()
def get_actor_photo(browser):
    """Collect a photo URL for every actor linked from the current movie page.

    The actor links are read from ``browser.page`` first; each link that
    points to a ``/search_act/`` page is then opened with
    ``browser.open_relative`` and the ``.full-bg`` CSS rule of that page is
    scanned for a ``background: url(...jpg)`` declaration.

    Note: the browser is navigated to the actor pages, so ``browser.page`` no
    longer refers to the movie page once this function returns.

    Args:
        browser: Browser object whose current page is the movie page.

    Returns:
        dict mapping actor name -> absolute photo URL. Actors without a
        usable link, page or background image are omitted.
    """
    url_marker = 'background: url('
    anchors = browser.page.select('#moviepages > div > div:nth-child(1) > div.movie-info.section > ul > li:nth-child(1) > span.spec-content > a')
    # Gather every link before navigating: opening an actor page replaces
    # browser.page.
    actor_links = {}
    for a in anchors:
        name = a.text.strip()
        if name == '':
            continue
        # .get() instead of ['href'] so an anchor without href is skipped by
        # the '/search_act/' test below rather than raising KeyError.
        actor_links[name] = a.get('href') or ''
    photos = {}
    for name, href in actor_links.items():
        if '/search_act/' not in href:
            continue
        if not browser.open_relative(href).ok:
            continue
        page_text = browser.page.prettify()
        pos = page_text.find('.full-bg')
        if pos < 0:
            continue
        # Only the first 100 characters of the rule are inspected.
        css = page_text[pos:pos + 100]
        p0 = css.find(url_marker)
        if p0 < 0:
            continue
        # Search for the end marker after the start marker so that an earlier
        # '.jpg)' can never produce a reversed or garbage slice.
        p1 = css.find('.jpg)', p0)
        if p1 < 0:
            continue
        photos[name] = urljoin(browser.url, css[p0 + len(url_marker):p1 + len('.jpg')])
    return photos
if __name__ == "__main__": if __name__ == "__main__":
print(main("070116-197")) # actor have photo
print(main("041721-001")) print(main("041721-001"))
print(main("080520-001")) print(main("080520-001"))

View File

@@ -17,7 +17,7 @@ def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
l=i.a['href'] l=i.a['href']
t=i.get_text() t=i.get_text()
html = etree.fromstring(get_html(l), etree.HTMLParser()) html = etree.fromstring(get_html(l), etree.HTMLParser())
p=abs_url("https://www.javbus.com", p=urljoin("https://www.javbus.com",
str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']")) str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']"))
p2={t:p} p2={t:p}
d.update(p2) d.update(p2)
@@ -48,7 +48,7 @@ def getYear(htmlcode): #获取年份
def getCover(htmlcode): #获取封面链接 def getCover(htmlcode): #获取封面链接
doc = pq(htmlcode) doc = pq(htmlcode)
image = doc('a.bigImage') image = doc('a.bigImage')
return abs_url("https://www.javbus.com", image.attr('href')) return urljoin("https://www.javbus.com", image.attr('href'))
def getRelease(htmlcode): #获取出版日期 def getRelease(htmlcode): #获取出版日期
html = etree.fromstring(htmlcode, etree.HTMLParser()) html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']") result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")

View File

@@ -17,19 +17,30 @@ def getTitle(a):
return result return result
def getActor(browser):
    """Return the names credited on the detail page currently open in *browser*.

    Args:
        browser: Browser object whose ``page`` is the detail page.

    Returns:
        list of the stripped link texts found under the ``credit-links``
        entry of the profile list, in page order. Links whose text is blank
        are skipped so the list never contains empty names.
    """
    credit_links = browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a')
    names = (link.text.strip() for link in credit_links)
    return [name for name in names if name]
def getActorPhoto(browser):
    """Return a photo URL for every name credited on the current detail page.

    Each credit link is opened with ``browser.open_relative`` and the profile
    image of the resulting page is resolved against ``browser.url``.

    Note: the browser is navigated to the profile pages, so ``browser.page``
    no longer refers to the detail page once this function returns.

    Args:
        browser: Browser object whose ``page`` is the detail page.

    Returns:
        dict mapping name -> absolute image URL, or '' for a name whose
        profile page could not be opened or shows no image.
    """
    credit_links = browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a')
    # Collect every link before navigating: opening a profile page replaces
    # browser.page.
    profile_hrefs = {a.text.strip(): a['href'] for a in credit_links}
    photos = {}
    for name, href in profile_hrefs.items():
        photo_url = ''
        if browser.open_relative(href).ok:
            pic = browser.page.select_one('#avidolDetails > div > div.frame > div > p > img')
            # select_one() gives None when the profile has no image; keep ''
            # instead of failing on pic['src'].
            if pic is not None and pic.get('src'):
                photo_url = urljoin(browser.url, pic['src'])
        # Store a real key/value pair; the former ``{k, ''}`` was a set
        # literal and made dict.update() raise ValueError.
        photos[name] = photo_url
    return photos
def getStudio(a): def getStudio(a):
@@ -82,7 +93,7 @@ def getYear(getRelease):
def getRelease(a): def getRelease(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try: try:
result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[4]/text()')[0] result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[2]/text()')[1])
except: except:
return '' return ''
try: try:
@@ -171,21 +182,34 @@ def getExtrafanart(htmlcode): # 获取剧照
def main(number): def main(number):
try: try:
number = number.upper() xcity_number = number.replace('-','')
query_result = get_html( query_result, browser = get_html_by_form(
'https://xcity.jp/result_published/?genre=%2Fresult_published%2F&q=' + number.replace('-','') + '&sg=main&num=30') 'https://xcity.jp/about/',
html = etree.fromstring(query_result, etree.HTMLParser()) # //table/tr[1]/td[1]/text() fields = {'q' : xcity_number.lower()},
urls = html.xpath("//table[contains(@class, 'resultList')]/tr[2]/td[1]/a/@href")[0] return_type = 'browser')
detail_page = get_html('https://xcity.jp' + urls) if not query_result or not query_result.ok:
raise ValueError("xcity.py: page not found")
result = browser.follow_link(browser.links('avod\/detail')[0])
if not result.ok:
raise ValueError("xcity.py: detail page not found")
detail_page = str(browser.page)
url = browser.url
newnum = getNum(detail_page).upper()
number_up = number.upper()
if newnum != number_up:
if newnum == xcity_number.upper():
newnum = number_up
else:
raise ValueError("xcity.py: number not found")
dic = { dic = {
'actor': getActor(detail_page), 'actor': getActor(browser),
'title': getTitle(detail_page), 'title': getTitle(detail_page),
'studio': getStudio(detail_page), 'studio': getStudio(detail_page),
'outline': getOutline(detail_page), 'outline': getOutline(detail_page),
'runtime': getRuntime(detail_page), 'runtime': getRuntime(detail_page),
'director': getDirector(detail_page), 'director': getDirector(detail_page),
'release': getRelease(detail_page), 'release': getRelease(detail_page),
'number': getNum(detail_page), 'number': newnum,
'cover': getCover(detail_page), 'cover': getCover(detail_page),
'cover_small': '', 'cover_small': '',
'extrafanart': getExtrafanart(detail_page), 'extrafanart': getExtrafanart(detail_page),
@@ -193,8 +217,8 @@ def main(number):
'tag': getTag(detail_page), 'tag': getTag(detail_page),
'label': getLabel(detail_page), 'label': getLabel(detail_page),
'year': getYear(getRelease(detail_page)), # str(re.search('\d{4}',getRelease(a)).group()), 'year': getYear(getRelease(detail_page)), # str(re.search('\d{4}',getRelease(a)).group()),
'actor_photo': getActorPhoto(getActor(detail_page)), 'actor_photo': getActorPhoto(browser),
'website': 'https://xcity.jp' + urls, 'website': url,
'source': 'xcity.py', 'source': 'xcity.py',
'series': getSeries(detail_page), 'series': getSeries(detail_page),
} }
@@ -207,4 +231,6 @@ def main(number):
return js return js
if __name__ == '__main__': if __name__ == '__main__':
print(main('VNDS-2624')) print(main('RCTD-288'))
#print(main('VNDS-2624'))
#print(main('ABP-345'))

View File

@@ -1,4 +1,4 @@
requests==2.20.0 requests==2.26.0
pyquery pyquery
lxml lxml
beautifulsoup4 beautifulsoup4
@@ -7,3 +7,4 @@ cloudscraper
pysocks==1.7.1 pysocks==1.7.1
urllib3==1.24.3 urllib3==1.24.3
certifi==2020.12.5 certifi==2020.12.5
MechanicalSoup==1.1.0