xcity.py: get detail page by form query

This commit is contained in:
lededev
2021-09-22 06:03:58 +08:00
parent ffd80ba0e4
commit b59b4938d6
3 changed files with 32 additions and 7 deletions

View File

@@ -11,6 +11,7 @@ from lxml import etree
import re
import config
from urllib.parse import urljoin
import mechanicalsoup
def getXpathSingle(htmlcode, xpath):
@@ -83,6 +84,27 @@ def post_html(url: str, query: dict, headers: dict = None) -> requests.Response:
print("[-]" + errors)
def get_html_by_form(url, form_name: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None):
    """Open `url`, fill the first (or named) form, submit it, and return the response.

    :param url: page containing the form to submit
    :param form_name: CSS selector for the form; None selects the first form on the page
    :param fields: mapping of form field name -> value to fill before submitting
    :param cookies: accepted for signature compatibility but currently NOT applied
                    to the session — TODO(review): wire into browser.session.cookies
    :param ua: User-Agent override; falls back to the module-wide G_USER_AGENT
    :param return_type: "object" -> requests.Response, "content" -> raw bytes,
                        anything else (default) -> decoded text
    """
    browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua)
    # Honor the project-wide proxy configuration, same as the other fetch helpers.
    configProxy = config.Config().proxy()
    if configProxy.enable:
        browser.session.proxies = configProxy.proxies()
    browser.open(url)  # return value was unused; StatefulBrowser tracks the page itself
    # select_form sets the browser's current form as a side effect; no binding needed.
    if form_name is None:
        browser.select_form()
    else:
        browser.select_form(form_name)
    if isinstance(fields, dict):
        for k, v in fields.items():
            browser[k] = v
    response = browser.submit_selected()
    # Force UTF-8: target sites often omit/mis-declare charset in headers.
    response.encoding = "utf-8"
    if return_type == "object":
        return response
    elif return_type == "content":
        return response.content
    else:
        return response.text
# def get_javlib_cookie() -> [dict, str]:
# import cloudscraper
# switch, proxy, timeout, retry_count, proxytype = config.Config().proxy()

View File

@@ -82,7 +82,7 @@ def getYear(getRelease):
def getRelease(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[4]/text()')[0]
result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[2]/text()')[1])
except:
return ''
try:
@@ -171,12 +171,13 @@ def getExtrafanart(htmlcode): # 获取剧照
def main(number):
try:
number = number.upper()
query_result = get_html(
'https://xcity.jp/result_published/?genre=%2Fresult_published%2F&q=' + number.replace('-','') + '&sg=main&num=30')
html = etree.fromstring(query_result, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
urls = html.xpath("//table[contains(@class, 'resultList')]/tr[2]/td[1]/a/@href")[0]
detail_page = get_html('https://xcity.jp' + urls)
query_result = get_html_by_form('https://xcity.jp/about/',
fields = {'q' : number.replace('-','').lower()})
html = etree.fromstring(query_result, etree.HTMLParser())
urls = str(html.xpath('//table[@class="resultList"]/tr[2]/td[1]/a/@href')).strip(" ['']")
if not len(urls):
raise ValueError("xcity.py: urls not found")
detail_page = get_html(abs_url('https://xcity.jp', urls))
dic = {
'actor': getActor(detail_page),
'title': getTitle(detail_page),
@@ -208,3 +209,4 @@ def main(number):
if __name__ == '__main__':
print(main('VNDS-2624'))
print(main('ABP-345'))

View File

@@ -7,3 +7,4 @@ cloudscraper
pysocks==1.7.1
urllib3==1.24.3
certifi==2020.12.5
MechanicalSoup==1.1.0