From 3cce3151007ca2e9b7c1c55c0b970405d6645173 Mon Sep 17 00:00:00 2001
From: lededev <lededev@noreplay.github.com>
Date: Fri, 11 Jun 2021 13:53:08 +0800
Subject: [PATCH] Convert relative image URLs to absolute URLs

---
 ADC_function.py      |  7 +++++++
 WebCrawler/airav.py  | 11 ++++-------
 WebCrawler/javbus.py | 10 +++-------
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/ADC_function.py b/ADC_function.py
index 6593006..6fe0370 100644
--- a/ADC_function.py
+++ b/ADC_function.py
@@ -9,6 +9,7 @@ import time
 from lxml import etree
 import re
 import config
+from urllib.parse import urljoin
 
 
 def get_data_state(data: dict) -> bool:  # 元数据获取失败检测
@@ -576,3 +577,9 @@ def is_link(filename: str):
     elif os.stat(filename).st_nlink > 1:
         return True # hard link Linux MAC OSX Windows NTFS
     return False
+
+# Turn a relative href into an absolute URL against base_url; '' if href is empty/None.
+def abs_url(base_url: str, href: str) -> str:
+    if not href or href.startswith('http'):  # nothing to resolve, or already absolute
+        return href or ''  # a missing attribute (None) is normalised to ''
+    return urljoin(base_url, href)  # handles both '/path' and 'path' forms
diff --git a/WebCrawler/airav.py b/WebCrawler/airav.py
index b0889f9..87efcc8 100644
--- a/WebCrawler/airav.py
+++ b/WebCrawler/airav.py
@@ -25,7 +25,8 @@ def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
         l=i.a['href']
         t=i.get_text()
         html = etree.fromstring(get_html(l), etree.HTMLParser())
-        p=str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']")
+        p=abs_url("https://www.javbus.com",
+                  str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']"))
         p2={t:p}
         d.update(p2)
     return d
@@ -59,7 +60,7 @@ def getYear(htmlcode):   #获取年份
 def getCover(htmlcode):  #获取封面链接
     doc = pq(htmlcode)
     image = doc('a.bigImage')
-    return image.attr('href')
+    return abs_url("https://www.javbus.com", image.attr('href'))  # cover href may be relative; make it absolute
 def getRelease(htmlcode): #获取出版日期
     html = etree.fromstring(htmlcode, etree.HTMLParser())
     result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
@@ -226,9 +227,5 @@ def main(number):
 
 if __name__ == '__main__':
     #print(main('ADN-188'))
-
     print(main('ADN-188'))
-    print(main('012717_472'))
-    print(main('080719-976'))
-    print(main('姫川ゆうな'))
-
+    print(main('CJOD-278'))
diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py
index d89c9dc..d378e0e 100644
--- a/WebCrawler/javbus.py
+++ b/WebCrawler/javbus.py
@@ -17,7 +17,8 @@ def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
         l=i.a['href']
         t=i.get_text()
         html = etree.fromstring(get_html(l), etree.HTMLParser())
-        p=str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']")
+        p=abs_url("https://www.javbus.com",
+                  str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']"))
         p2={t:p}
         d.update(p2)
     return d
@@ -47,12 +48,7 @@ def getYear(htmlcode):   #获取年份
 def getCover(htmlcode):  #获取封面链接
     doc = pq(htmlcode)
     image = doc('a.bigImage')
-    uri = image.attr('href')
-    if uri.startswith('http'):
-        return uri
-    if uri[0] != '/':
-        uri = '/' + uri
-    return "https://www.javbus.com" + uri
+    return abs_url("https://www.javbus.com", image.attr('href'))  # shared helper replaces the manual prefixing above
 def getRelease(htmlcode): #获取出版日期
     html = etree.fromstring(htmlcode, etree.HTMLParser())
     result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")