diff --git a/ADC_function.py b/ADC_function.py index 8029bab..de56eb0 100644 --- a/ADC_function.py +++ b/ADC_function.py @@ -593,9 +593,3 @@ def is_link(filename: str): return True # hard link Linux MAC OSX Windows NTFS return False -# URL相对路径转绝对路径 -def abs_url(base_url: str, href: str) -> str: - if href.startswith('http'): - return href - return urljoin(base_url, href) - diff --git a/WebCrawler/airav.py b/WebCrawler/airav.py index 87efcc8..5925421 100644 --- a/WebCrawler/airav.py +++ b/WebCrawler/airav.py @@ -25,7 +25,7 @@ def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img l=i.a['href'] t=i.get_text() html = etree.fromstring(get_html(l), etree.HTMLParser()) - p=abs_url("https://www.javbus.com", + p=urljoin("https://www.javbus.com", str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']")) p2={t:p} d.update(p2) @@ -60,7 +60,7 @@ def getYear(htmlcode): #获取年份 def getCover(htmlcode): #获取封面链接 doc = pq(htmlcode) image = doc('a.bigImage') - return abs_url("https://www.javbus.com", image.attr('href')) + return urljoin("https://www.javbus.com", image.attr('href')) def getRelease(htmlcode): #获取出版日期 html = etree.fromstring(htmlcode, etree.HTMLParser()) result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']") diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py index d378e0e..7446ef3 100644 --- a/WebCrawler/javbus.py +++ b/WebCrawler/javbus.py @@ -17,7 +17,7 @@ def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img l=i.a['href'] t=i.get_text() html = etree.fromstring(get_html(l), etree.HTMLParser()) - p=abs_url("https://www.javbus.com", + p=urljoin("https://www.javbus.com", str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']")) p2={t:p} d.update(p2) @@ -48,7 +48,7 @@ def getYear(htmlcode): #获取年份 def getCover(htmlcode): #获取封面链接 doc = pq(htmlcode) image = doc('a.bigImage') - return abs_url("https://www.javbus.com", image.attr('href')) + return urljoin("https://www.javbus.com", image.attr('href')) def getRelease(htmlcode): #获取出版日期 html = etree.fromstring(htmlcode, etree.HTMLParser()) result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']") diff --git a/WebCrawler/xcity.py b/WebCrawler/xcity.py index fc0e3bc..884b366 100644 --- a/WebCrawler/xcity.py +++ b/WebCrawler/xcity.py @@ -36,7 +36,7 @@ def getActorPhoto(browser): r = browser.open_relative(v) if r.ok: pic = browser.page.select_one('#avidolDetails > div > div.frame > div > p > img') - p = {k: abs_url(browser.url, pic['src'])} + p = {k: urljoin(browser.url, pic['src'])} else: p = {k, ''} o.update(p)