From 3cce3151007ca2e9b7c1c55c0b970405d6645173 Mon Sep 17 00:00:00 2001
From: lededev
Date: Fri, 11 Jun 2021 13:53:08 +0800
Subject: [PATCH] convert image relative url to absolute url

---
Review notes (free text after the separator; not part of the commit message):

* ADC_function.py: imports urljoin and adds abs_url(base_url, href), which
  returns href unchanged when it starts with 'http' and otherwise returns
  urljoin(base_url, href). The comment added above it reads "convert a
  relative URL path to an absolute path".
* WebCrawler/airav.py and WebCrawler/javbus.py: getActorPhoto() and
  getCover() now pass the scraped image URL through abs_url() with
  "https://www.javbus.com" as the base. javbus.getCover() drops its
  hand-written startswith('http') / leading-'/' prefixing in favour of the
  helper.
* WebCrawler/airav.py: the __main__ block loses several main(...) calls and
  gains main('CJOD-278').
* NOTE(review): urljoin(base, '') returns base. When the actor-photo XPath
  matches nothing, the stripped string is '' and p becomes the bare site URL
  instead of '' -- confirm callers cope with that.
* NOTE(review): abs_url() calls href.startswith(), so a None href raises
  AttributeError. Presumably image.attr('href') is None when a.bigImage is
  absent -- verify; airav.getCover() previously returned that value as-is.
* NOTE(review): neither crawler hunk adds an import for abs_url; presumably
  both modules already pull names in from ADC_function -- confirm.
* NOTE(review): line breaks and indentation in this copy were restored from
  a whitespace-collapsed dump. Hunk line counts match the @@ headers and the
  diffstat, but indentation is inferred and may not match the target files
  byte-for-byte. In the last airav.py hunk the 080719-976 line is shown as
  context because that is the reading consistent with "-226,9 +227,5".

 ADC_function.py      |  7 +++++++
 WebCrawler/airav.py  | 11 ++++-------
 WebCrawler/javbus.py | 10 +++-------
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/ADC_function.py b/ADC_function.py
index 6593006..6fe0370 100644
--- a/ADC_function.py
+++ b/ADC_function.py
@@ -9,6 +9,7 @@ import time
 from lxml import etree
 import re
 import config
+from urllib.parse import urljoin
 
 
 def get_data_state(data: dict) -> bool: # 元数据获取失败检测
@@ -576,3 +577,9 @@ def is_link(filename: str):
     elif os.stat(filename).st_nlink > 1:
         return True # hard link Linux MAC OSX Windows NTFS
     return False
+
+# URL相对路径转绝对路径
+def abs_url(base_url: str, href: str) -> str:
+    if href.startswith('http'):
+        return href
+    return urljoin(base_url, href)
diff --git a/WebCrawler/airav.py b/WebCrawler/airav.py
index b0889f9..87efcc8 100644
--- a/WebCrawler/airav.py
+++ b/WebCrawler/airav.py
@@ -25,7 +25,8 @@ def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
         l=i.a['href']
         t=i.get_text()
         html = etree.fromstring(get_html(l), etree.HTMLParser())
-        p=str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']")
+        p=abs_url("https://www.javbus.com",
+                  str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']"))
         p2={t:p}
         d.update(p2)
     return d
@@ -59,7 +60,7 @@ def getYear(htmlcode): #获取年份
 def getCover(htmlcode): #获取封面链接
     doc = pq(htmlcode)
     image = doc('a.bigImage')
-    return image.attr('href')
+    return abs_url("https://www.javbus.com", image.attr('href'))
 def getRelease(htmlcode): #获取出版日期
     html = etree.fromstring(htmlcode, etree.HTMLParser())
     result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
@@ -226,9 +227,5 @@ def main(number):
 
 if __name__ == '__main__':
     #print(main('ADN-188'))
-    print(main('ADN-188'))
-    print(main('012717_472'))
-
     print(main('080719-976'))
-    print(main('姫川ゆうな'))
-
+    print(main('CJOD-278'))
diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py
index d89c9dc..d378e0e 100644
--- a/WebCrawler/javbus.py
+++ b/WebCrawler/javbus.py
@@ -17,7 +17,8 @@ def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
         l=i.a['href']
         t=i.get_text()
         html = etree.fromstring(get_html(l), etree.HTMLParser())
-        p=str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']")
+        p=abs_url("https://www.javbus.com",
+                  str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']"))
         p2={t:p}
         d.update(p2)
     return d
@@ -47,12 +48,7 @@ def getYear(htmlcode): #获取年份
 def getCover(htmlcode): #获取封面链接
     doc = pq(htmlcode)
     image = doc('a.bigImage')
-    uri = image.attr('href')
-    if uri.startswith('http'):
-        return uri
-    if uri[0] != '/':
-        uri = '/' + uri
-    return "https://www.javbus.com" + uri
+    return abs_url("https://www.javbus.com", image.attr('href'))
 def getRelease(htmlcode): #获取出版日期
     html = etree.fromstring(htmlcode, etree.HTMLParser())
     result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")