改爲使用🗾日本語Metadata，同時對監修、Studio刮削進行細微調整

2020-09-15 00:05:17 +08:00
parent a2793e2723
commit 5d4bc3454a
1 changed files with 35 additions and 20 deletions
@@ -29,9 +29,16 @@ def getTitle(htmlcode):  #获取标题
        return title2
    except:
        return title
-def getStudio(htmlcode): #获取厂商
+def getStudio(htmlcode): #获取厂商 已修改
    html = etree.fromstring(htmlcode,etree.HTMLParser())
-    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/a/text()')).strip(" ['']")
+    # 如果记录中没有导演，厂商排在第4位
+    if 'メーカー:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"):
+        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']")
+    # 如果记录中有导演，厂商排在第5位
+    elif 'メーカー:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/span/text()')).strip(" ['']"):
+        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/a/text()')).strip(" ['']")
+    else:
+        result = ''
    return result
 def getYear(htmlcode):   #获取年份
    html = etree.fromstring(htmlcode,etree.HTMLParser())
@@ -45,10 +52,10 @@ def getRelease(htmlcode): #获取出版日期
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
    return result
-def getRuntime(htmlcode): #获取分钟
-    soup = BeautifulSoup(htmlcode, 'lxml')
-    a = soup.find(text=re.compile('分鐘'))
-    return a
+def getRuntime(htmlcode): #获取分钟 已修改
+    html = etree.fromstring(htmlcode, etree.HTMLParser())
+    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[3]/text()')).strip(" ['']分鐘")
+    return result
 def getActor(htmlcode):   #获取女优
    b=[]
    soup=BeautifulSoup(htmlcode,'lxml')
@@ -60,9 +67,12 @@ def getNum(htmlcode):     #获取番号
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
    return result
-def getDirector(htmlcode): #获取导演
+def getDirector(htmlcode): #获取导演 已修改
    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']")
+    if '監督:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"):
+        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']")
+    else:
+        result = ''         # 记录中有可能没有导演数据
    return result
 def getCID(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
@@ -77,14 +87,18 @@ def getOutline(htmlcode):  #获取演员
        return result
    except:
        return ''
-def getSerise(htmlcode):
-    try:
-        html = etree.fromstring(htmlcode, etree.HTMLParser())
+def getSerise(htmlcode):   #获取系列 已修改
+    html = etree.fromstring(htmlcode, etree.HTMLParser())
+    # 如果记录中没有导演，系列排在第6位
+    if 'シリーズ:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/span/text()')).strip(" ['']"):
+        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/a/text()')).strip(" ['']")
+    # 如果记录中有导演，系列排在第7位
+    elif 'シリーズ:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/span/text()')).strip(" ['']"):
        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']")
-        return result
-    except:
-        return ''
-def getTag(htmlcode):  # 获取演员
+    else:
+        result = ''
+    return result
+def getTag(htmlcode):  # 获取标签
    tag = []
    soup = BeautifulSoup(htmlcode, 'lxml')
    a = soup.find_all(attrs={'class': 'genre'})
@@ -94,10 +108,11 @@ def getTag(htmlcode):  # 获取演员
        tag.append(i.get_text())
    return tag

+
 def main_uncensored(number):
-    htmlcode = get_html('https://www.javbus.com/' + number)
+    htmlcode = get_html('https://www.javbus.com/ja/' + number)
    if getTitle(htmlcode) == '':
-        htmlcode = get_html('https://www.javbus.com/' + number.replace('-','_'))
+        htmlcode = get_html('https://www.javbus.com/ja/' + number.replace('-','_'))
    try:
        dww_htmlcode = fanza.main_htmlcode(getCID(htmlcode))
    except:
@@ -117,7 +132,7 @@ def main_uncensored(number):
        'label': getSerise(htmlcode),
        'imagecut': 0,
        'actor_photo': '',
-        'website': 'https://www.javbus.com/' + number,
+        'website': 'https://www.javbus.com/ja/' + number,
        'source': 'javbus.py',
        'series': getSerise(htmlcode),
    }
@@ -128,7 +143,7 @@ def main_uncensored(number):
 def main(number):
    try:
        try:
-            htmlcode = get_html('https://www.javbus.com/' + number)
+            htmlcode = get_html('https://www.javbus.com/ja/' + number)
            try:
                dww_htmlcode = fanza.main_htmlcode(getCID(htmlcode))
            except:
@@ -148,7 +163,7 @@ def main(number):
                'tag': getTag(htmlcode),
                'label': getSerise(htmlcode),
                'actor_photo': getActorPhoto(htmlcode),
-                'website': 'https://www.javbus.com/' + number,
+                'website': 'https://www.javbus.com/ja/' + number,
                'source': 'javbus.py',
                'series': getSerise(htmlcode),
            }