Convert relative image URLs to absolute URLs

This commit is contained in:
lededev
2021-06-11 13:53:08 +08:00
parent 8fbe101196
commit 3cce315100
3 changed files with 14 additions and 14 deletions

View File

@@ -9,6 +9,7 @@ import time
from lxml import etree from lxml import etree
import re import re
import config import config
from urllib.parse import urljoin
def get_data_state(data: dict) -> bool: # 元数据获取失败检测 def get_data_state(data: dict) -> bool: # 元数据获取失败检测
@@ -576,3 +577,9 @@ def is_link(filename: str):
elif os.stat(filename).st_nlink > 1: elif os.stat(filename).st_nlink > 1:
return True # hard link Linux MAC OSX Windows NTFS return True # hard link Linux MAC OSX Windows NTFS
return False return False
# Convert a relative URL into an absolute URL.
def abs_url(base_url: str, href: str) -> str:
    """Resolve *href* against *base_url* and return an absolute URL.

    Args:
        base_url: Absolute URL the relative reference is resolved against.
        href: Link taken from a page; may be absolute, scheme-relative
            (``//host/path``), root-relative (``/path``) or path-relative.

    Returns:
        ``href`` unchanged when it is already an absolute http(s) URL, the
        joined URL for relative references, or ``''`` when ``href`` is empty
        or ``None``.
    """
    # A missing link must stay "missing": urljoin(base_url, '') would return
    # the bare base URL, which callers would then treat as a valid image URL.
    if not href:
        return ''
    # Match the full scheme prefix so that a relative path that merely begins
    # with "http" (e.g. "httpdocs/a.jpg") is still resolved against the base.
    if href.startswith(('http://', 'https://')):
        return href
    return urljoin(base_url, href)

View File

@@ -25,7 +25,8 @@ def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
l=i.a['href'] l=i.a['href']
t=i.get_text() t=i.get_text()
html = etree.fromstring(get_html(l), etree.HTMLParser()) html = etree.fromstring(get_html(l), etree.HTMLParser())
p=str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']") p=abs_url("https://www.javbus.com",
str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']"))
p2={t:p} p2={t:p}
d.update(p2) d.update(p2)
return d return d
@@ -59,7 +60,7 @@ def getYear(htmlcode): #获取年份
def getCover(htmlcode): #获取封面链接 def getCover(htmlcode): #获取封面链接
doc = pq(htmlcode) doc = pq(htmlcode)
image = doc('a.bigImage') image = doc('a.bigImage')
return image.attr('href') return abs_url("https://www.javbus.com", image.attr('href'))
def getRelease(htmlcode): #获取出版日期 def getRelease(htmlcode): #获取出版日期
html = etree.fromstring(htmlcode, etree.HTMLParser()) html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']") result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
@@ -226,9 +227,5 @@ def main(number):
if __name__ == '__main__': if __name__ == '__main__':
#print(main('ADN-188')) #print(main('ADN-188'))
print(main('ADN-188')) print(main('ADN-188'))
print(main('012717_472')) print(main('CJOD-278'))
print(main('080719-976'))
print(main('姫川ゆうな'))

View File

@@ -17,7 +17,8 @@ def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
l=i.a['href'] l=i.a['href']
t=i.get_text() t=i.get_text()
html = etree.fromstring(get_html(l), etree.HTMLParser()) html = etree.fromstring(get_html(l), etree.HTMLParser())
p=str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']") p=abs_url("https://www.javbus.com",
str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']"))
p2={t:p} p2={t:p}
d.update(p2) d.update(p2)
return d return d
@@ -47,12 +48,7 @@ def getYear(htmlcode): #获取年份
def getCover(htmlcode): #获取封面链接 def getCover(htmlcode): #获取封面链接
doc = pq(htmlcode) doc = pq(htmlcode)
image = doc('a.bigImage') image = doc('a.bigImage')
uri = image.attr('href') return abs_url("https://www.javbus.com", image.attr('href'))
if uri.startswith('http'):
return uri
if uri[0] != '/':
uri = '/' + uri
return "https://www.javbus.com" + uri
def getRelease(htmlcode): #获取出版日期 def getRelease(htmlcode): #获取出版日期
html = etree.fromstring(htmlcode, etree.HTMLParser()) html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']") result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")