diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index c15bb48..b761435 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -474,18 +474,11 @@ def main(): check_update(version) # Download Mapping Table, parallel version - down_map_tab = [] - actor_xml = Path.home() / '.local' / 'share' / 'avdc' / 'mapping_actor.xml' - if not actor_xml.exists(): - down_map_tab.append(( - "https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/mapping_actor.xml", - actor_xml)) - info_xml = Path.home() / '.local' / 'share' / 'avdc' / 'mapping_info.xml' - if not info_xml.exists(): - down_map_tab.append(( - "https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/mapping_info.xml", - info_xml)) - res = parallel_download_files(down_map_tab) + def fmd(f): + return ('https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/' + f, + Path.home() / '.local' / 'share' / 'avdc' / f) + map_tab = (fmd('mapping_actor.xml'), fmd('mapping_info.xml'), fmd('c_number.json')) + res = parallel_download_files(((k, v) for k, v in map_tab if not v.exists())) for i, fp in enumerate(res, start=1): if fp and len(fp): print(f"[+] [{i}/{len(res)}] Mapping Table Downloaded to {fp}") diff --git a/WebCrawler/avsox.py b/WebCrawler/avsox.py index 8b73b83..3699aa4 100644 --- a/WebCrawler/avsox.py +++ b/WebCrawler/avsox.py @@ -57,8 +57,8 @@ def getCover_small(html): result = str(html.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')).strip(" ['']") return result def getTag(html): - result = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',') - return result + x = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',') + return [i.strip() for i in x[2:]] if len(x) > 2 else [] def getSeries(html): try: result1 = str(html.xpath('//span[contains(text(),"系列:")]/../span[2]/text()')).strip(" ['']") diff --git a/WebCrawler/fc2.py b/WebCrawler/fc2.py index 27bc1a0..c559c8d 100644 --- a/WebCrawler/fc2.py +++ b/WebCrawler/fc2.py @@ -14,7 +14,7 @@ def getTitle_fc2com(htmlcode): #获取厂商 return result def getActor_fc2com(htmlcode): try: - htmtml = etree.fromstring(htmlcode, etree.HTMLParser()) + html = etree.fromstring(htmlcode, etree.HTMLParser()) result = html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')[0] return result except: diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py index 09dc045..46493da 100644 --- a/WebCrawler/javbus.py +++ b/WebCrawler/javbus.py @@ -72,7 +72,7 @@ def getSerise(html): #获取系列 return str(x[0]) if len(x) else '' def getTag(html): # 获取标签 klist = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',') - return klist + return klist[1:] def getExtrafanart(htmlcode): # 获取剧照 html_pather = re.compile(r'
[\s\S]*?
\s*?') html = html_pather.search(htmlcode) diff --git a/WebCrawler/javlib.py b/WebCrawler/javlib.py index 7af0c14..538fc19 100644 --- a/WebCrawler/javlib.py +++ b/WebCrawler/javlib.py @@ -34,10 +34,10 @@ def main(number: str): ) soup = BeautifulSoup(result.text, "html.parser") lx = html.fromstring(str(soup)) - + fanhao_pather = re.compile(r'
(.*?)
') fanhao = fanhao_pather.findall(result.text) - + if "/?v=jav" in result.url: dic = { "title": get_title(lx, soup),