Merge pull request #591 from lededev/xcity-f1

xcity.py: get detail page by form query
2021-09-27 22:00:43 +08:00
parent 875c5dc3a1 4ffc34a5cf
commit 30bc6a59c6
7 changed files with 158 additions and 48 deletions
--- a/ADC_function.py
+++ b/ADC_function.py
@@ -11,6 +11,7 @@ from lxml import etree
 import re
 import config
 from urllib.parse import urljoin
 import mechanicalsoup
 def getXpathSingle(htmlcode, xpath):
@@ -83,6 +84,51 @@ def post_html(url: str, query: dict, headers: dict = None) -> requests.Response:
    print("[-]" + errors)
 def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type: str = None):
    """Open ``url`` with a new MechanicalSoup ``StatefulBrowser`` and return the page.

    Parameters:
        url: address to open.
        cookies: optional dict of cookies copied into the browser session
            before the request is sent.
        ua: User-Agent string; ``G_USER_AGENT`` is used when this is None.
        return_type: selects what is returned on success:
            "object"  -> the response object,
            "content" -> ``response.content``,
            "browser" -> the tuple ``(response, browser)``,
            anything else -> ``response.text``.

    Returns:
        The value selected by ``return_type``. When the response is not ok,
        '' is returned, except in "browser" mode, where the
        ``(response, browser)`` tuple is still returned so that callers can
        unpack it and test ``response.ok`` themselves.
    """
    browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua)
    # Honour the ``cookies`` argument the same way get_html_by_form does;
    # it used to be accepted here but silently ignored.
    if isinstance(cookies, dict):
        requests.utils.add_dict_to_cookiejar(browser.session.cookies, cookies)
    configProxy = config.Config().proxy()
    if configProxy.enable:
        browser.session.proxies = configProxy.proxies()
    result = browser.open(url)
    if not result.ok:
        # "browser" callers write `response, browser = ...`; returning ''
        # there made the unpacking itself fail instead of their .ok check.
        return (result, browser) if return_type == "browser" else ''
    # Decode the body as UTF-8 regardless of what the server declared.
    result.encoding = "utf-8"
    if return_type == "object":
        return result
    if return_type == "content":
        return result.content
    if return_type == "browser":
        return result, browser
    return result.text
 def get_html_by_form(url, form_name: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None):
    """Open ``url``, fill in a form on that page, submit it and return the result.

    Parameters:
        url: address of the page that contains the form.
        form_name: value handed to ``browser.select_form``; when None,
            ``select_form()`` is called with its default selector.
        fields: optional dict of form field name -> value to set before
            submitting.
        cookies: optional dict of cookies copied into the browser session
            before the first request.
        ua: User-Agent string; ``G_USER_AGENT`` is used when this is None.
        return_type: selects what is returned for the submission response:
            "object"  -> the response object,
            "content" -> ``response.content``,
            "browser" -> the tuple ``(response, browser)``,
            anything else -> ``response.text``.

    Returns:
        The value selected by ``return_type``. If the initial page cannot be
        opened, '' is returned, except in "browser" mode, where
        ``(response, browser)`` for the failed page is returned so that
        callers can unpack it and test ``response.ok``. The status of the
        submission response itself is not checked here.
    """
    browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua)
    if isinstance(cookies, dict):
        requests.utils.add_dict_to_cookiejar(browser.session.cookies, cookies)
    configProxy = config.Config().proxy()
    if configProxy.enable:
        browser.session.proxies = configProxy.proxies()
    result = browser.open(url)
    if not result.ok:
        # "browser" callers write `response, browser = ...`; returning ''
        # there made the unpacking itself fail instead of their .ok check.
        return (result, browser) if return_type == "browser" else ''
    # Selecting the form makes it the target of the item assignments and of
    # submit_selected() below; the returned form object is not needed.
    if form_name is None:
        browser.select_form()
    else:
        browser.select_form(form_name)
    if isinstance(fields, dict):
        for k, v in fields.items():
            browser[k] = v
    response = browser.submit_selected()
    # Decode the body as UTF-8 regardless of what the server declared.
    response.encoding = "utf-8"
    if return_type == "object":
        return response
    if return_type == "content":
        return response.content
    if return_type == "browser":
        return response, browser
    return response.text
 # def get_javlib_cookie() -> [dict, str]:
 #     import cloudscraper
 #     switch, proxy, timeout, retry_count, proxytype = config.Config().proxy()
@@ -568,10 +614,3 @@ def is_link(filename: str):
    elif os.stat(filename).st_nlink > 1:
        return True # hard link Linux MAC OSX Windows NTFS
    return False
 # Convert a relative URL path into an absolute URL
 def abs_url(base_url: str, href: str) -> str:
    """Return ``href`` as-is when it starts with 'http', otherwise join it onto ``base_url``."""
    already_absolute = href.startswith('http')
    return href if already_absolute else urljoin(base_url, href)
--- a/WebCrawler/init.py
+++ b/WebCrawler/init.py
@@ -55,7 +55,7 @@ def get_data_from_json(file_number, conf: config.Config):  # 从JSON返回元数
    # default fetch order list, from the beginning to the end
    sources = conf.sources().split(',')
-    if not len(conf.sources()) > 60:
+    if not len(conf.sources()) > 80:
        # if the input file name matches certain rules,
        # move some web service to the beginning of the list
        lo_file_number = file_number.lower()
@@ -82,6 +82,11 @@ def get_data_from_json(file_number, conf: config.Config):  # 从JSON返回元数
                "rj" in lo_file_number or "vj" in lo_file_number
        ):
            sources.insert(0, sources.pop(sources.index("dlsite")))
        elif re.match(r"^[a-z0-9]{3,}$", lo_file_number):
            if "javdb" in sources:
                sources.insert(0, sources.pop(sources.index("javdb")))
            if "xcity" in sources:
                sources.insert(0, sources.pop(sources.index("xcity")))
    # check sources in func_mapping
    todel = []
--- a/WebCrawler/airav.py
+++ b/WebCrawler/airav.py
@@ -25,7 +25,7 @@ def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
        l=i.a['href']
        t=i.get_text()
        html = etree.fromstring(get_html(l), etree.HTMLParser())
-        p=abs_url("https://www.javbus.com",
+        p=urljoin("https://www.javbus.com",
                  str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']"))
        p2={t:p}
        d.update(p2)
@@ -60,7 +60,7 @@ def getYear(htmlcode):   #获取年份
 def getCover(htmlcode):  # get the cover image link
    # Parse the page with pyquery and select the <a class="bigImage"> element;
    # its href attribute is the cover link.
    doc = pq(htmlcode)
    image = doc('a.bigImage')
    # The href is resolved against the javbus.com origin before being returned.
-    return abs_url("https://www.javbus.com", image.attr('href'))
+    return urljoin("https://www.javbus.com", image.attr('href'))
 def getRelease(htmlcode): #获取出版日期
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
--- a/WebCrawler/carib.py
+++ b/WebCrawler/carib.py
@@ -8,15 +8,16 @@ from ADC_function import *
 def main(number: str) -> json:
    try:
-        caribbytes = get_html('https://www.caribbeancom.com/moviepages/'+number+'/index.html',
+        caribbytes, browser = get_html_by_browser(
-                             return_type="content")
+            'https://www.caribbeancom.com/moviepages/'+number+'/index.html',
            return_type="browser")
-        caribhtml = caribbytes.decode("euc_jp")
+        if not caribbytes or not caribbytes.ok:
            raise ValueError("page not found")
-        soup = BeautifulSoup(caribhtml, "html.parser")
+        lx = html.fromstring(str(browser.page))
        lx = html.fromstring(str(soup))
-        if not soup.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"):
+        if not browser.page.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"):
            raise ValueError("page info not found")
    except Exception as e:
        if config.Config().debug():
@@ -27,7 +28,7 @@ def main(number: str) -> json:
        'title': get_title(lx),
        'studio': '加勒比',
        'year': get_year(lx),
-        'outline': '',
+        'outline': get_outline(lx),
        'runtime': get_runtime(lx),
        'director': '',
        'actor': get_actor(lx),
@@ -36,12 +37,12 @@ def main(number: str) -> json:
        'cover': 'https://www.caribbeancom.com/moviepages/' + number + '/images/l_l.jpg',
        'tag': get_tag(lx),
        'extrafanart': get_extrafanart(lx),
-        'label': '',
+        'label': get_series(lx),
-        'imagecut': 0,
+        'imagecut': 1,
-        'actor_photo': '',
+        'actor_photo': get_actor_photo(browser),
        'website': 'https://www.caribbeancom.com/moviepages/' + number + '/index.html',
        'source': 'carib.py',
-        'series': '',
+        'series': get_series(lx),
    }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
    return js
@@ -52,10 +53,13 @@ def get_title(lx: html.HtmlElement) -> str:
 def get_year(lx: html.HtmlElement) -> str:
    """Return the first four characters of the text in the second <li>'s 'spec-content' span.

    Indexing the first match fails if the XPath finds nothing.
    """
    spec_texts = lx.xpath("//li[2]/span[@class='spec-content']/text()")
    first_text = spec_texts[0]
    return first_text[:4]
 def get_outline(lx: html.HtmlElement) -> str:
    """Return the stripped first text node of the description paragraph in the 'movie-info section' div.

    Indexing the first match fails if the XPath finds nothing.
    """
    description_nodes = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")
    first_node = description_nodes[0]
    return first_node.strip()
 def get_release(lx: html.HtmlElement) -> str:
    """Return the text of the second <li>'s 'spec-content' span with every '/' replaced by '-'.

    Indexing the first match fails if the XPath finds nothing.
    """
    spec_texts = lx.xpath("//li[2]/span[@class='spec-content']/text()")
    slashed = spec_texts[0]
    return slashed.replace('/','-')
-def get_actor(lx: html.HtmlElement) -> str:
+def get_actor(lx: html.HtmlElement):
    r = []
    actors = lx.xpath("//span[@class='spec-content']/a[@itemprop='actor']/span/text()")
    for act in actors:
@@ -81,9 +85,44 @@ def get_extrafanart(lx: html.HtmlElement) -> str:
            r.append('https://www.caribbeancom.com' + jpg)
    return r
 def get_series(lx: html.HtmlElement) -> str:
    """Return the series name shown on the page, or '' when the page has none.

    The name is the text of the link inside the 'spec-content' span that sits
    next to the 'spec-title' span whose text contains 'シリーズ'.
    """
    matches = lx.xpath("//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()")
    # An explicit emptiness check replaces the former bare `except:`, which
    # also swallowed KeyboardInterrupt/SystemExit and hid unrelated errors.
    if not matches:
        return ''
    return str(matches[0]).strip()
 def get_runtime(lx: html.HtmlElement) -> str:
    """Return the stripped text of the 'duration' span inside a 'spec-content' span.

    Indexing the first match fails if the XPath finds nothing.
    """
    duration_nodes = lx.xpath( "//span[@class='spec-content']/span[@itemprop='duration']/text()")
    first_duration = str(duration_nodes[0])
    return first_duration.strip()
 def get_actor_photo(browser):
    """Return a dict mapping actor name -> photo URL for actors whose page exposes one.

    Actor links are read from the first <li> of the movie-info list on the
    page currently loaded in ``browser``. Each '/search_act/' link is then
    opened with ``browser.open_relative`` and the '.jpg' URL that follows
    'background: url(' within 100 characters of '.full-bg' is taken as the
    photo. Actors for which any step fails are left out of the result.
    """
    anchors = browser.page.select('#moviepages > div > div:nth-child(1) > div.movie-info.section > ul > li:nth-child(1) > span.spec-content > a')
    links = {}
    for anchor in anchors:
        # A link whose text is exactly '他' is skipped (presumably an
        # "others" placeholder rather than a person; verify against the site).
        if anchor.text.strip() == '他':
            continue
        links[anchor.text.strip()] = anchor['href']
    url_prefix = 'background: url('
    photos = {}
    for name, href in links.items():
        if '/search_act/' not in href:
            continue
        if not browser.open_relative(href).ok:
            continue
        page_text = browser.page.prettify()
        rule_at = page_text.find('.full-bg')
        if rule_at<0:
            continue
        # Only the 100 characters starting at '.full-bg' are searched.
        snippet = page_text[rule_at:rule_at+100]
        url_from = snippet.find(url_prefix)
        url_to = snippet.find('.jpg)')
        if url_from<0 or url_to<0:
            continue
        # Keep the '.jpg' suffix but drop the closing parenthesis.
        photos[name] = urljoin(browser.url, snippet[url_from+len(url_prefix):url_to+len('.jpg')])
    return photos
 if __name__ == "__main__":
    # Manual run: print what main() returns for a few sample numbers.
    for sample_number in (
        "070116-197",  # actor have photo
        "041721-001",
        "080520-001",
    ):
        print(main(sample_number))
--- a/WebCrawler/javbus.py
+++ b/WebCrawler/javbus.py
@@ -17,7 +17,7 @@ def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
        l=i.a['href']
        t=i.get_text()
        html = etree.fromstring(get_html(l), etree.HTMLParser())
-        p=abs_url("https://www.javbus.com",
+        p=urljoin("https://www.javbus.com",
                  str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']"))
        p2={t:p}
        d.update(p2)
@@ -48,7 +48,7 @@ def getYear(htmlcode):   #获取年份
 def getCover(htmlcode):  # get the cover image link
    # Parse the page with pyquery and select the <a class="bigImage"> element;
    # its href attribute is the cover link.
    doc = pq(htmlcode)
    image = doc('a.bigImage')
    # The href is resolved against the javbus.com origin before being returned.
-    return abs_url("https://www.javbus.com", image.attr('href'))
+    return urljoin("https://www.javbus.com", image.attr('href'))
 def getRelease(htmlcode): #获取出版日期
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
--- a/WebCrawler/xcity.py
+++ b/WebCrawler/xcity.py
@@ -17,19 +17,30 @@ def getTitle(a):
    return result
-def getActor(a):  # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
+def getActor(browser):  # returns a list with the text of every credit link on browser.page
-    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
+    htmla = browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a')  # all <a> under li.credit-links
-    result1 = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[3]/a/text()')[0]
+    t = []
-    return result1
+    for i in htmla:
        t.append(i.text.strip())  # link text with surrounding whitespace removed
    return t
-def getActorPhoto(actor):  # //*[@id="star_qdt"]/li/a/img
+def getActorPhoto(browser):
    """Return a dict mapping each credited name to a photo URL ('' when none is found).

    The credit links are read from the page currently loaded in ``browser``.
    Each link is then opened with ``browser.open_relative`` and the ``src``
    of the profile image on that page is resolved against ``browser.url``.
    Because the links are opened in ``browser``, the page it holds changes
    while this function runs.
    """
-    a = actor.split(',')
+    htmla = browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a')
-    d = {}
+    t = {}
-    for i in a:
+    for i in htmla:
-        p = {i: ''}
+        p = {i.text.strip(): i['href']}
-        d.update(p)
+        t.update(p)
-    return d
+    o = {}
    for k, v in t.items():
        r = browser.open_relative(v)
        # select_one() yields None when the page has no matching <img>, so
        # guard it instead of subscripting None.
        pic = browser.page.select_one('#avidolDetails > div > div.frame > div > p > img') if r.ok else None
        # The fallback used to be `{k, ''}`, a set literal, which made
        # o.update() raise ValueError; store an empty string under the name.
        o[k] = urljoin(browser.url, pic['src']) if pic is not None else ''
    return o
 def getStudio(a):
@@ -82,7 +93,7 @@ def getYear(getRelease):
 def getRelease(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
-        result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[4]/text()')[0]
+        result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[2]/text()')[1])
    except:
        return ''
    try:
@@ -171,21 +182,34 @@ def getExtrafanart(htmlcode):  # 获取剧照
 def main(number):
    try:
-        number = number.upper()
+        xcity_number = number.replace('-','')
-        query_result = get_html(
+        query_result, browser = get_html_by_form(
-            'https://xcity.jp/result_published/?genre=%2Fresult_published%2F&q=' + number.replace('-','') + '&sg=main&num=30')
+            'https://xcity.jp/about/',
-        html = etree.fromstring(query_result, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
+            fields = {'q' : xcity_number.lower()},
-        urls = html.xpath("//table[contains(@class, 'resultList')]/tr[2]/td[1]/a/@href")[0]
+            return_type = 'browser')
-        detail_page = get_html('https://xcity.jp' + urls)
+        if not query_result or not query_result.ok:
            raise ValueError("xcity.py: page not found")
        result = browser.follow_link(browser.links('avod\/detail')[0])
        if not result.ok:
            raise ValueError("xcity.py: detail page not found")
        detail_page = str(browser.page)
        url = browser.url
        newnum = getNum(detail_page).upper()
        number_up = number.upper()
        if newnum != number_up:
            if newnum == xcity_number.upper():
                newnum = number_up
            else:
                raise ValueError("xcity.py: number not found")
        dic = {
-            'actor': getActor(detail_page),
+            'actor': getActor(browser),
            'title': getTitle(detail_page),
            'studio': getStudio(detail_page),
            'outline': getOutline(detail_page),
            'runtime': getRuntime(detail_page),
            'director': getDirector(detail_page),
            'release': getRelease(detail_page),
-            'number': getNum(detail_page),
+            'number': newnum,
            'cover': getCover(detail_page),
            'cover_small': '',
            'extrafanart': getExtrafanart(detail_page),
@@ -193,8 +217,8 @@ def main(number):
            'tag': getTag(detail_page),
            'label': getLabel(detail_page),
            'year': getYear(getRelease(detail_page)),  # str(re.search('\d{4}',getRelease(a)).group()),
-            'actor_photo': getActorPhoto(getActor(detail_page)),
+            'actor_photo': getActorPhoto(browser),
-            'website': 'https://xcity.jp' + urls,
+            'website': url,
            'source': 'xcity.py',
            'series': getSeries(detail_page),
        }
@@ -207,4 +231,6 @@ def main(number):
    return js
 if __name__ == '__main__':
    # Manual run: print what main() returns for one sample number; other
    # sample numbers are kept commented out below.
-    print(main('VNDS-2624'))
+    print(main('RCTD-288'))
    #print(main('VNDS-2624'))
    #print(main('ABP-345'))
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-requests==2.20.0
+requests==2.26.0
 pyquery
 lxml
 beautifulsoup4
@@ -7,3 +7,4 @@ cloudscraper
 pysocks==1.7.1
 urllib3==1.24.3
 certifi==2020.12.5
 MechanicalSoup==1.1.0