diff --git a/ADC_function.py b/ADC_function.py
index 7374a60..b23cee2 100644
--- a/ADC_function.py
+++ b/ADC_function.py
@@ -11,6 +11,7 @@ from lxml import etree
 import re
 import config
 from urllib.parse import urljoin
+import mechanicalsoup
 
 
 def getXpathSingle(htmlcode, xpath):
@@ -83,6 +84,27 @@ def post_html(url: str, query: dict, headers: dict = None) -> requests.Response:
         print("[-]" + errors)
 
 
+def get_html_by_form(url, form_name: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None):
+    browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua)
+    configProxy = config.Config().proxy()
+    if configProxy.enable:
+        browser.session.proxies = configProxy.proxies()
+    result = browser.open(url)
+    form = browser.select_form() if form_name is None else browser.select_form(form_name)
+    if isinstance(fields, dict):
+        for k, v in fields.items():
+            browser[k] = v
+    response = browser.submit_selected()
+    response.encoding = "utf-8"
+
+    if return_type == "object":
+        return response
+    elif return_type == "content":
+        return response.content
+    else:
+        return response.text
+
+
 # def get_javlib_cookie() -> [dict, str]:
 #     import cloudscraper
 #     switch, proxy, timeout, retry_count, proxytype = config.Config().proxy()
diff --git a/WebCrawler/xcity.py b/WebCrawler/xcity.py
index 53981e5..ec872f5 100644
--- a/WebCrawler/xcity.py
+++ b/WebCrawler/xcity.py
@@ -82,7 +82,7 @@ def getYear(getRelease):
 def getRelease(a):
     html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
     try:
-        result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[4]/text()')[0]
+        result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[2]/text()')[1])
     except:
         return ''
     try:
@@ -171,12 +171,13 @@ def getExtrafanart(htmlcode):  # 获取剧照
 
 def main(number):
     try:
-        number = number.upper()
-        query_result = get_html(
-            'https://xcity.jp/result_published/?genre=%2Fresult_published%2F&q=' + number.replace('-','') + '&sg=main&num=30')
-        html = etree.fromstring(query_result, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-        urls = html.xpath("//table[contains(@class, 'resultList')]/tr[2]/td[1]/a/@href")[0]
-        detail_page = get_html('https://xcity.jp' + urls)
+        query_result = get_html_by_form('https://xcity.jp/about/',
+                                        fields = {'q' : number.replace('-','').lower()})
+        html = etree.fromstring(query_result, etree.HTMLParser())
+        urls = str(html.xpath('//table[@class="resultList"]/tr[2]/td[1]/a/@href')).strip(" ['']")
+        if not len(urls):
+            raise ValueError("xcity.py: urls not found")
+        detail_page = get_html(abs_url('https://xcity.jp', urls))
         dic = {
             'actor': getActor(detail_page),
             'title': getTitle(detail_page),
@@ -208,3 +209,4 @@ def main(number):
 
 if __name__ == '__main__':
     print(main('VNDS-2624'))
+    print(main('ABP-345'))
diff --git a/requirements.txt b/requirements.txt
index 8b6ab2d..89dc0af 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,3 +7,4 @@ cloudscraper
 pysocks==1.7.1
 urllib3==1.24.3
 certifi==2020.12.5
+MechanicalSoup==1.1.0
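
For reviewers, a minimal standalone sketch of how the new helper could be exercised, assuming `get_html_by_form` is imported from `ADC_function` and that `https://xcity.jp/about/` exposes the site-search form as the first form on the page (the same assumption the patched `xcity.py` main() makes); the search id used here is only illustrative:

    # sketch only: drives get_html_by_form the same way the patched main() does
    from ADC_function import get_html_by_form

    # submit the xcity search form with a hyphen-stripped, lower-cased id;
    # return_type is left at its default, so the decoded HTML text is returned
    html_text = get_html_by_form('https://xcity.jp/about/',
                                 fields={'q': 'VNDS-2624'.replace('-', '').lower()})
    print(len(html_text))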