From b59b4938d6482472e8fe93dfdf30c01c7b775782 Mon Sep 17 00:00:00 2001
From: lededev
Date: Wed, 22 Sep 2021 06:03:58 +0800
Subject: [PATCH 01/12] xcity.py: get detail page by form query

---
 ADC_function.py     | 22 ++++++++++++++++++++++
 WebCrawler/xcity.py | 16 +++++++++-------
 requirements.txt    |  1 +
 3 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/ADC_function.py b/ADC_function.py
index 7374a60..b23cee2 100644
--- a/ADC_function.py
+++ b/ADC_function.py
@@ -11,6 +11,7 @@ from lxml import etree
 import re
 import config
 from urllib.parse import urljoin
+import mechanicalsoup


 def getXpathSingle(htmlcode, xpath):
@@ -83,6 +84,27 @@ def post_html(url: str, query: dict, headers: dict = None) -> requests.Response:
         print("[-]" + errors)


+def get_html_by_form(url, form_name: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None):
+    browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua)
+    configProxy = config.Config().proxy()
+    if configProxy.enable:
+        browser.session.proxies = configProxy.proxies()
+    result = browser.open(url)
+    form = browser.select_form() if form_name is None else browser.select_form(form_name)
+    if isinstance(fields, dict):
+        for k, v in fields.items():
+            browser[k] = v
+    response = browser.submit_selected()
+    response.encoding = "utf-8"
+
+    if return_type == "object":
+        return response
+    elif return_type == "content":
+        return response.content
+    else:
+        return response.text
+
+
 # def get_javlib_cookie() -> [dict, str]:
 #     import cloudscraper
 #     switch, proxy, timeout, retry_count, proxytype = config.Config().proxy()
diff --git a/WebCrawler/xcity.py b/WebCrawler/xcity.py
index 53981e5..ec872f5 100644
--- a/WebCrawler/xcity.py
+++ b/WebCrawler/xcity.py
@@ -82,7 +82,7 @@ def getYear(getRelease):
 def getRelease(a):
     html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
     try:
-        result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[4]/text()')[0]
+        result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[2]/text()')[1])
     except:
         return ''
     try:
@@ -171,12 +171,13 @@ def getExtrafanart(htmlcode):  # 获取剧照

 def main(number):
     try:
-        number = number.upper()
-        query_result = get_html(
-            'https://xcity.jp/result_published/?genre=%2Fresult_published%2F&q=' + number.replace('-','') + '&sg=main&num=30')
-        html = etree.fromstring(query_result, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-        urls = html.xpath("//table[contains(@class, 'resultList')]/tr[2]/td[1]/a/@href")[0]
-        detail_page = get_html('https://xcity.jp' + urls)
+        query_result = get_html_by_form('https://xcity.jp/about/',
+            fields = {'q' : number.replace('-','').lower()})
+        html = etree.fromstring(query_result, etree.HTMLParser())
+        urls = str(html.xpath('//table[@class="resultList"]/tr[2]/td[1]/a/@href')).strip(" ['']")
+        if not len(urls):
+            raise ValueError("xcity.py: urls not found")
+        detail_page = get_html(abs_url('https://xcity.jp', urls))
         dic = {
             'actor': getActor(detail_page),
             'title': getTitle(detail_page),
@@ -208,3 +209,4 @@ def main(number):

 if __name__ == '__main__':
     print(main('VNDS-2624'))
+    print(main('ABP-345'))
diff --git a/requirements.txt b/requirements.txt
index 8b6ab2d..89dc0af 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,3 +7,4 @@ cloudscraper
 pysocks==1.7.1
 urllib3==1.24.3
 certifi==2020.12.5
+MechanicalSoup==1.1.0
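A quick usage sketch of the new helper (not part of the patch series; it assumes the project's ADC_function.py and its config.ini are importable, and the URL and the 'q' field simply mirror the xcity.py hunk above):

    from ADC_function import get_html_by_form

    # submit xcity's search form with the dash-less number, exactly as
    # WebCrawler/xcity.py now does; the default return_type yields the
    # response body as text
    html_text = get_html_by_form('https://xcity.jp/about/',
                                 fields={'q': 'VNDS-2624'.replace('-', '').lower()})
    print(html_text[:200])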
From 446eb166a3339352d375ef9a4aa9be15f21da72e Mon Sep 17 00:00:00 2001
From: lededev
Date: Wed, 22 Sep 2021 06:13:20 +0800
Subject: [PATCH 02/12] requests 2.26.0 is required by MechanicalSoup 1.1.0

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 89dc0af..c7e86bd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-requests==2.20.0
+requests==2.26.0
 pyquery
 lxml
 beautifulsoup4

From d6677b717d6eea896740295a18ff334b53a8f263 Mon Sep 17 00:00:00 2001
From: lededev
Date: Thu, 23 Sep 2021 03:56:55 +0800
Subject: [PATCH 03/12] get_html_by_form(): add cookie support and check the
 HTTP status

---
 ADC_function.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/ADC_function.py b/ADC_function.py
index b23cee2..3112bae 100644
--- a/ADC_function.py
+++ b/ADC_function.py
@@ -86,17 +86,20 @@ def post_html(url: str, query: dict, headers: dict = None) -> requests.Response:

 def get_html_by_form(url, form_name: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None):
     browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua)
+    if isinstance(cookies, dict):
+        requests.utils.add_dict_to_cookiejar(browser.session.cookies, cookies)
     configProxy = config.Config().proxy()
     if configProxy.enable:
         browser.session.proxies = configProxy.proxies()
     result = browser.open(url)
+    if not result.ok:
+        return ''
     form = browser.select_form() if form_name is None else browser.select_form(form_name)
     if isinstance(fields, dict):
         for k, v in fields.items():
             browser[k] = v
     response = browser.submit_selected()
     response.encoding = "utf-8"
-
     if return_type == "object":
         return response
     elif return_type == "content":

From 367d53b09b9a0b34ae2e3a796926853be21add04 Mon Sep 17 00:00:00 2001
From: lededev
Date: Thu, 23 Sep 2021 05:10:43 +0800
Subject: [PATCH 04/12] add a return_type that hands back the stateful browser
 for follow_link()

---
 ADC_function.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ADC_function.py b/ADC_function.py
index 3112bae..8029bab 100644
--- a/ADC_function.py
+++ b/ADC_function.py
@@ -104,6 +104,8 @@ def get_html_by_form(url, form_name: str = None, fields: dict = None, cookies: d
         return response
     elif return_type == "content":
         return response.content
+    elif return_type == "browser":
+        return response, browser
     else:
         return response.text
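With the 'browser' return type above, a caller can keep navigating in the same session (cookies, proxy, user agent) after the form is submitted. A sketch under the same import assumption as before; the link regex is the one xcity.py adopts in the next patch:

    from ADC_function import get_html_by_form

    response, browser = get_html_by_form('https://xcity.jp/about/',
                                         fields={'q': 'abp345'},
                                         return_type='browser')
    if response and response.ok:
        # pick the first search hit and follow it with the same session
        link = browser.links('avod/detail')[0]   # regex filter on href
        browser.follow_link(link)
        print(browser.url)                       # URL of the detail page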
From c32a4a12accfdf8c7f8142399391c8762e23aa6b Mon Sep 17 00:00:00 2001
From: lededev
Date: Thu, 23 Sep 2021 07:01:24 +0800
Subject: [PATCH 05/12] speed up by reusing the stateful browser

---
 WebCrawler/xcity.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/WebCrawler/xcity.py b/WebCrawler/xcity.py
index ec872f5..285b5d4 100644
--- a/WebCrawler/xcity.py
+++ b/WebCrawler/xcity.py
@@ -171,13 +171,16 @@ def getExtrafanart(htmlcode):  # 获取剧照

 def main(number):
     try:
-        query_result = get_html_by_form('https://xcity.jp/about/',
-            fields = {'q' : number.replace('-','').lower()})
-        html = etree.fromstring(query_result, etree.HTMLParser())
-        urls = str(html.xpath('//table[@class="resultList"]/tr[2]/td[1]/a/@href')).strip(" ['']")
-        if not len(urls):
-            raise ValueError("xcity.py: urls not found")
-        detail_page = get_html(abs_url('https://xcity.jp', urls))
+        query_result, browser = get_html_by_form(
+            'https://xcity.jp/about/',
+            fields = {'q' : number.replace('-','').lower()},
+            return_type = 'browser')
+        if not query_result or not query_result.ok:
+            raise ValueError("xcity.py: page not found")
+        result = browser.follow_link(browser.links('avod\/detail')[0])
+        if not result.ok:
+            raise ValueError("xcity.py: detail page not found")
+        detail_page = str(browser.page)
         dic = {
             'actor': getActor(detail_page),
             'title': getTitle(detail_page),
@@ -195,7 +198,7 @@ def main(number):
             'label': getLabel(detail_page),
             'year': getYear(getRelease(detail_page)),  # str(re.search('\d{4}',getRelease(a)).group()),
             'actor_photo': getActorPhoto(getActor(detail_page)),
-            'website': 'https://xcity.jp' + urls,
+            'website': browser.url,
             'source': 'xcity.py',
             'series': getSeries(detail_page),
         }

From c599463409d34f6e04f46bc86afadbd6fc9711f6 Mon Sep 17 00:00:00 2001
From: lededev
Date: Thu, 23 Sep 2021 07:58:53 +0800
Subject: [PATCH 06/12] rewrite getActorPhoto() to fetch the real photo

---
 WebCrawler/xcity.py | 45 +++++++++++++++++++++++++++++----------------
 1 file changed, 29 insertions(+), 16 deletions(-)

diff --git a/WebCrawler/xcity.py b/WebCrawler/xcity.py
index 285b5d4..fc0e3bc 100644
--- a/WebCrawler/xcity.py
+++ b/WebCrawler/xcity.py
@@ -17,19 +17,30 @@ def getTitle(a):
     return result


-def getActor(a):  # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
-    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
-    result1 = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[3]/a/text()')[0]
-    return result1
+def getActor(browser):
+    htmla = browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a')
+    t = []
+    for i in htmla:
+        t.append(i.text.strip())
+    return t


-def getActorPhoto(actor):  # //*[@id="star_qdt"]/li/a/img
-    a = actor.split(',')
-    d = {}
-    for i in a:
-        p = {i: ''}
-        d.update(p)
-    return d
+def getActorPhoto(browser):
+    htmla = browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a')
+    t = {}
+    for i in htmla:
+        p = {i.text.strip(): i['href']}
+        t.update(p)
+    o = {}
+    for k, v in t.items():
+        r = browser.open_relative(v)
+        if r.ok:
+            pic = browser.page.select_one('#avidolDetails > div > div.frame > div > p > img')
+            p = {k: abs_url(browser.url, pic['src'])}
+        else:
+            p = {k: ''}
+        o.update(p)
+    return o


 def getStudio(a):
@@ -181,8 +192,9 @@ def main(number):
         if not result.ok:
             raise ValueError("xcity.py: detail page not found")
         detail_page = str(browser.page)
+        url = browser.url
         dic = {
-            'actor': getActor(detail_page),
+            'actor': getActor(browser),
             'title': getTitle(detail_page),
             'studio': getStudio(detail_page),
             'outline': getOutline(detail_page),
@@ -197,8 +209,8 @@ def main(number):
             'tag': getTag(detail_page),
             'label': getLabel(detail_page),
             'year': getYear(getRelease(detail_page)),  # str(re.search('\d{4}',getRelease(a)).group()),
-            'actor_photo': getActorPhoto(getActor(detail_page)),
-            'website': browser.url,
+            'actor_photo': getActorPhoto(browser),
+            'website': url,
             'source': 'xcity.py',
             'series': getSeries(detail_page),
         }
@@ -211,5 +223,6 @@ def main(number):
     return js

 if __name__ == '__main__':
-    print(main('VNDS-2624'))
-    print(main('ABP-345'))
+    print(main('RCTD-288'))
+    #print(main('VNDS-2624'))
+    #print(main('ABP-345'))
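The rewrite above trades positional XPath for CSS selectors over the already-parsed browser.page (a BeautifulSoup document). The core pattern, reduced to a self-contained toy (the HTML fragment is invented for illustration):

    from bs4 import BeautifulSoup

    page = BeautifulSoup(
        '<ul class="profileCL"><li class="credit-links">'
        '<a href="/idol/detail/?id=1">Actor A</a></li></ul>', 'html.parser')

    # same shape as getActor()/getActorPhoto(): one selector, then text/href
    anchors = page.select('ul.profileCL > li.credit-links > a')
    print([a.text.strip() for a in anchors])             # ['Actor A']
    print({a.text.strip(): a['href'] for a in anchors})  # {'Actor A': '/idol/detail/?id=1'}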
From 54ed626294d89b9ddd7f66e4089b55f58fad33b0 Mon Sep 17 00:00:00 2001
From: lededev
Date: Thu, 23 Sep 2021 08:21:01 +0800
Subject: [PATCH 07/12] remove abs_url(); plain urljoin() is enough

---
 ADC_function.py      | 6 ------
 WebCrawler/airav.py  | 4 ++--
 WebCrawler/javbus.py | 4 ++--
 WebCrawler/xcity.py  | 2 +-
 4 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/ADC_function.py b/ADC_function.py
index 8029bab..de56eb0 100644
--- a/ADC_function.py
+++ b/ADC_function.py
@@ -593,9 +593,3 @@ def is_link(filename: str):
         return True  # hard link Linux MAC OSX Windows NTFS
     return False

-# URL相对路径转绝对路径
-def abs_url(base_url: str, href: str) -> str:
-    if href.startswith('http'):
-        return href
-    return urljoin(base_url, href)
-
diff --git a/WebCrawler/airav.py b/WebCrawler/airav.py
index 87efcc8..5925421 100644
--- a/WebCrawler/airav.py
+++ b/WebCrawler/airav.py
@@ -25,7 +25,7 @@ def getActorPhoto(htmlcode):  # //*[@id="star_qdt"]/li/a/img
         l=i.a['href']
         t=i.get_text()
         html = etree.fromstring(get_html(l), etree.HTMLParser())
-        p=abs_url("https://www.javbus.com",
+        p=urljoin("https://www.javbus.com",
             str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']"))
         p2={t:p}
         d.update(p2)
@@ -60,7 +60,7 @@ def getYear(htmlcode): #获取年份
 def getCover(htmlcode): #获取封面链接
     doc = pq(htmlcode)
     image = doc('a.bigImage')
-    return abs_url("https://www.javbus.com", image.attr('href'))
+    return urljoin("https://www.javbus.com", image.attr('href'))
 def getRelease(htmlcode): #获取出版日期
     html = etree.fromstring(htmlcode, etree.HTMLParser())
     result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py
index d378e0e..7446ef3 100644
--- a/WebCrawler/javbus.py
+++ b/WebCrawler/javbus.py
@@ -17,7 +17,7 @@ def getActorPhoto(htmlcode):  # //*[@id="star_qdt"]/li/a/img
         l=i.a['href']
         t=i.get_text()
         html = etree.fromstring(get_html(l), etree.HTMLParser())
-        p=abs_url("https://www.javbus.com",
+        p=urljoin("https://www.javbus.com",
             str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']"))
         p2={t:p}
         d.update(p2)
@@ -48,7 +48,7 @@ def getYear(htmlcode): #获取年份
 def getCover(htmlcode): #获取封面链接
     doc = pq(htmlcode)
     image = doc('a.bigImage')
-    return abs_url("https://www.javbus.com", image.attr('href'))
+    return urljoin("https://www.javbus.com", image.attr('href'))
 def getRelease(htmlcode): #获取出版日期
     html = etree.fromstring(htmlcode, etree.HTMLParser())
     result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
diff --git a/WebCrawler/xcity.py b/WebCrawler/xcity.py
index fc0e3bc..884b366 100644
--- a/WebCrawler/xcity.py
+++ b/WebCrawler/xcity.py
@@ -36,7 +36,7 @@ def getActorPhoto(browser):
         r = browser.open_relative(v)
         if r.ok:
             pic = browser.page.select_one('#avidolDetails > div > div.frame > div > p > img')
-            p = {k: abs_url(browser.url, pic['src'])}
+            p = {k: urljoin(browser.url, pic['src'])}
         else:
             p = {k: ''}
         o.update(p)

From 5e0e8b9cea08f4eb1e2778a6be016a4f7c2e803e Mon Sep 17 00:00:00 2001
From: lededev
Date: Thu, 23 Sep 2021 15:43:00 +0800
Subject: [PATCH 08/12] the site list in the default config.ini is now longer
 than 60 chars

---
 WebCrawler/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/WebCrawler/__init__.py b/WebCrawler/__init__.py
index 0bac971..13a4145 100644
--- a/WebCrawler/__init__.py
+++ b/WebCrawler/__init__.py
@@ -55,7 +55,7 @@ def get_data_from_json(file_number, conf: config.Config):  # 从JSON返回元数

     # default fetch order list, from the beginning to the end
     sources = conf.sources().split(',')
-    if not len(conf.sources()) > 60:
+    if not len(conf.sources()) > 80:
         # if the input file name matches certain rules,
         # move some web service to the beginning of the list
         lo_file_number = file_number.lower()
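For reference, the guard that this patch retunes, in isolation: the smart reordering below it only runs while conf.sources() stays within 80 characters, so a shipped default list that grew past the old 60-character cap keeps triggering it while a long hand-written list still opts out. A sketch with an assumed, abridged default list:

    sources_str = 'javbus,airav,fanza,xcity,mgstage,fc2,avsox,javdb,dlsite'  # 55 chars
    sources = sources_str.split(',')
    if not len(sources_str) > 80:   # custom lists longer than this skip reordering
        sources.insert(0, sources.pop(sources.index('xcity')))
    print(sources[:3])              # ['xcity', 'javbus', 'airav']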
From 50574a705b9fb33b4446d99241e4d443a2129b25 Mon Sep 17 00:00:00 2001
From: lededev
Date: Thu, 23 Sep 2021 15:45:00 +0800
Subject: [PATCH 09/12] carib.py: add outline/series/actor_photo

---
 ADC_function.py     | 20 +++++++++++++-
 WebCrawler/carib.py | 63 ++++++++++++++++++++++++++++++++++++---------
 2 files changed, 70 insertions(+), 13 deletions(-)
 mode change 100644 => 100755 ADC_function.py

diff --git a/ADC_function.py b/ADC_function.py
old mode 100644
new mode 100755
index de56eb0..7a23a52
--- a/ADC_function.py
+++ b/ADC_function.py
@@ -84,6 +84,25 @@ def post_html(url: str, query: dict, headers: dict = None) -> requests.Response:
         print("[-]" + errors)


+def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type: str = None):
+    browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua)
+    configProxy = config.Config().proxy()
+    if configProxy.enable:
+        browser.session.proxies = configProxy.proxies()
+    result = browser.open(url)
+    if not result.ok:
+        return ''
+    result.encoding = "utf-8"
+    if return_type == "object":
+        return result
+    elif return_type == "content":
+        return result.content
+    elif return_type == "browser":
+        return result, browser
+    else:
+        return result.text
+
+
 def get_html_by_form(url, form_name: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None):
     browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua)
     if isinstance(cookies, dict):
@@ -592,4 +611,3 @@ def is_link(filename: str):
     elif os.stat(filename).st_nlink > 1:
         return True  # hard link Linux MAC OSX Windows NTFS
     return False
-
diff --git a/WebCrawler/carib.py b/WebCrawler/carib.py
index b173255..f4fa9c0 100755
--- a/WebCrawler/carib.py
+++ b/WebCrawler/carib.py
@@ -8,15 +8,16 @@ from ADC_function import *

 def main(number: str) -> json:
     try:
-        caribbytes = get_html('https://www.caribbeancom.com/moviepages/'+number+'/index.html',
-                              return_type="content")
+        caribbytes, browser = get_html_by_browser(
+            'https://www.caribbeancom.com/moviepages/'+number+'/index.html',
+            return_type="browser")

-        caribhtml = caribbytes.decode("euc_jp")
+        if not caribbytes or not caribbytes.ok:
+            raise ValueError("page not found")

-        soup = BeautifulSoup(caribhtml, "html.parser")
-        lx = html.fromstring(str(soup))
+        lx = html.fromstring(str(browser.page))

-        if not soup.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"):
+        if not browser.page.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"):
             raise ValueError("page info not found")
     except Exception as e:
         if config.Config().debug():
@@ -27,7 +28,7 @@ def main(number: str) -> json:
         'title': get_title(lx),
         'studio': '加勒比',
         'year': get_year(lx),
-        'outline': '',
+        'outline': get_outline(lx),
         'runtime': get_runtime(lx),
         'director': '',
         'actor': get_actor(lx),
@@ -36,12 +37,12 @@ def main(number: str) -> json:
         'cover': 'https://www.caribbeancom.com/moviepages/' + number + '/images/l_l.jpg',
         'tag': get_tag(lx),
         'extrafanart': get_extrafanart(lx),
-        'label': '',
-        'imagecut': 0,
-        'actor_photo': '',
+        'label': get_series(lx),
+        'imagecut': 1,
+        'actor_photo': get_actor_photo(browser),
         'website': 'https://www.caribbeancom.com/moviepages/' + number + '/index.html',
         'source': 'carib.py',
-        'series': '',
+        'series': get_series(lx),
     }
     js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
     return js
@@ -52,10 +53,13 @@ def get_title(lx: html.HtmlElement) -> str:
 def get_year(lx: html.HtmlElement) -> str:
     return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0][:4]

+def get_outline(lx: html.HtmlElement) -> str:
+    return lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip()
+
 def get_release(lx: html.HtmlElement) -> str:
     return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0].replace('/','-')

-def get_actor(lx: html.HtmlElement) -> str:
+def get_actor(lx: html.HtmlElement):
     r = []
     actors = lx.xpath("//span[@class='spec-content']/a[@itemprop='actor']/span/text()")
     for act in actors:
@@ -81,9 +85,44 @@ def get_extrafanart(lx: html.HtmlElement) -> str:
         r.append('https://www.caribbeancom.com' + jpg)
     return r

+def get_series(lx: html.HtmlElement) -> str:
+    try:
+        return str(lx.xpath("//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()")[0]).strip()
+    except:
+        return ''
+
 def get_runtime(lx: html.HtmlElement) -> str:
     return str(lx.xpath(
         "//span[@class='spec-content']/span[@itemprop='duration']/text()")[0]).strip()

+def get_actor_photo(browser):
+    htmla = browser.page.select('#moviepages > div > div:nth-child(1) > div.movie-info.section > ul > li:nth-child(1) > span.spec-content > a')
+    t = {}
+    for a in htmla:
+        if a.text.strip() == '他':
+            continue
+        p = {a.text.strip(): a['href']}
+        t.update(p)
+    o = {}
+    for k, v in t.items():
+        if '/search_act/' not in v:
+            continue
+        r = browser.open_relative(v)
+        if not r.ok:
+            continue
+        html = browser.page.prettify()
+        pos = html.find('.full-bg')
+        if pos < 0:
+            continue
+        css = html[pos:pos+100]
+        p0 = css.find('background: url(')
+        p1 = css.find('.jpg)')
+        if p0 < 0 or p1 < 0:
+            continue
+        p = {k: urljoin(browser.url, css[p0+len('background: url('):p1+len('.jpg')])}
+        o.update(p)
+    return o
+
 if __name__ == "__main__":
+    print(main("070116-197"))  # this actor has a photo
     print(main("041721-001"))
     print(main("080520-001"))
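get_actor_photo() above digs the portrait out of an inline stylesheet rather than an img tag, by slicing around 'background: url(...)'. The extraction step on its own, with a made-up CSS fragment standing in for prettify() output:

    from urllib.parse import urljoin

    css = '.full-bg { background: url(/img/actress/012345.jpg) no-repeat; }'
    frag = css[css.find('.full-bg'):css.find('.full-bg') + 100]
    p0 = frag.find('background: url(')
    p1 = frag.find('.jpg)')
    if p0 >= 0 and p1 >= 0:
        src = frag[p0 + len('background: url('):p1 + len('.jpg')]
        print(urljoin('https://www.caribbeancom.com/search_act/x/', src))
        # -> https://www.caribbeancom.com/img/actress/012345.jpg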
From 6c990e84820aeb38ff646eb3f55b1f8646b68ea6 Mon Sep 17 00:00:00 2001
From: lededev
Date: Sat, 25 Sep 2021 06:45:08 +0800
Subject: [PATCH 10/12] xcity.py: Mode 3 requires the file name to remain
 unchanged

---
 WebCrawler/xcity.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/WebCrawler/xcity.py b/WebCrawler/xcity.py
index 884b366..6cd9325 100644
--- a/WebCrawler/xcity.py
+++ b/WebCrawler/xcity.py
@@ -182,9 +182,10 @@ def getExtrafanart(htmlcode):  # 获取剧照

 def main(number):
     try:
+        xcity_number = number.replace('-','')
         query_result, browser = get_html_by_form(
             'https://xcity.jp/about/',
-            fields = {'q' : number.replace('-','').lower()},
+            fields = {'q' : xcity_number.lower()},
             return_type = 'browser')
         if not query_result or not query_result.ok:
             raise ValueError("xcity.py: page not found")
@@ -193,6 +194,10 @@ def main(number):
             raise ValueError("xcity.py: detail page not found")
         detail_page = str(browser.page)
         url = browser.url
+        newnum = getNum(detail_page).upper()
+        number_up = number.upper()
+        if newnum != number_up and newnum == xcity_number.upper():
+            newnum = number_up
         dic = {
             'actor': getActor(browser),
             'title': getTitle(detail_page),
@@ -201,7 +206,7 @@ def main(number):
             'runtime': getRuntime(detail_page),
             'director': getDirector(detail_page),
             'release': getRelease(detail_page),
-            'number': getNum(detail_page),
+            'number': newnum,
             'cover': getCover(detail_page),
             'cover_small': '',
             'extrafanart': getExtrafanart(detail_page),

From 43bb64d7d08f75ea93d9873886e6999d6a013106 Mon Sep 17 00:00:00 2001
From: lededev
Date: Sat, 25 Sep 2021 06:53:40 +0800
Subject: [PATCH 11/12] xcity.py: strictly validate the scraped number

---
 WebCrawler/xcity.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/WebCrawler/xcity.py b/WebCrawler/xcity.py
index 6cd9325..f531470 100644
--- a/WebCrawler/xcity.py
+++ b/WebCrawler/xcity.py
@@ -196,8 +196,11 @@ def main(number):
         url = browser.url
         newnum = getNum(detail_page).upper()
         number_up = number.upper()
-        if newnum != number_up and newnum == xcity_number.upper():
-            newnum = number_up
+        if newnum != number_up:
+            if newnum == xcity_number.upper():
+                newnum = number_up
+            else:
+                raise ValueError("xcity.py: number not found")
         dic = {
             'actor': getActor(browser),
             'title': getTitle(detail_page),
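Patches 10 and 11 together normalize the scraped number: keep the caller's dashed file name when the site merely echoes the dash-less form, and reject anything else. The decision logic, lifted into a standalone sketch (the helper name is for illustration only):

    def check_number(number, scraped):
        xcity_number = number.replace('-', '')
        newnum = scraped.upper()
        number_up = number.upper()
        if newnum != number_up:
            if newnum == xcity_number.upper():
                newnum = number_up       # restore the dashed spelling
            else:
                raise ValueError("xcity.py: number not found")
        return newnum

    print(check_number('ABP-345', 'abp345'))   # -> ABP-345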
From 4ffc34a5cfa85122386588ed95b11a99434664b6 Mon Sep 17 00:00:00 2001
From: lededev
Date: Sat, 25 Sep 2021 20:54:07 +0800
Subject: [PATCH 12/12] move xcity to the top when the number looks like
 ABP321

---
 WebCrawler/__init__.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/WebCrawler/__init__.py b/WebCrawler/__init__.py
index 13a4145..43680ea 100644
--- a/WebCrawler/__init__.py
+++ b/WebCrawler/__init__.py
@@ -82,6 +82,11 @@ def get_data_from_json(file_number, conf: config.Config):  # 从JSON返回元数
             "rj" in lo_file_number or "vj" in lo_file_number
     ):
         sources.insert(0, sources.pop(sources.index("dlsite")))
+    elif re.match(r"^[a-z0-9]{3,}$", lo_file_number):
+        if "javdb" in sources:
+            sources.insert(0, sources.pop(sources.index("javdb")))
+        if "xcity" in sources:
+            sources.insert(0, sources.pop(sources.index("xcity")))

     # check sources in func_mapping
     todel = []
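The effect of the new elif branch, shown standalone with an assumed source list: a bare alphanumeric number such as ABP321 promotes javdb and then xcity, leaving xcity first:

    import re

    sources = ['javbus', 'javdb', 'xcity', 'carib']
    lo_file_number = 'abp321'
    if re.match(r'^[a-z0-9]{3,}$', lo_file_number):
        if 'javdb' in sources:
            sources.insert(0, sources.pop(sources.index('javdb')))
        if 'xcity' in sources:
            sources.insert(0, sources.pop(sources.index('xcity')))
    print(sources)   # ['xcity', 'javdb', 'javbus', 'carib']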