add tjnq.py tjgb.py miit.py internet.py

This commit is contained in:
yiliyassh@163.com
2022-05-18 17:54:33 +08:00
parent 8ec1ca4e19
commit 028fd96032
6 changed files with 288 additions and 18 deletions

76
code/internet.py Normal file
View File

@@ -0,0 +1,76 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author: yiliyas
# Date: 2022.5.18
import re
from urllib import request
# 1. Fetch data
def get_html(url):
    """Fetch *url* over HTTP(S) and return the response body decoded as UTF-8."""
    # Spoof a desktop-browser User-Agent so the server returns the normal page.
    ua = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36'
    reply = request.urlopen(request.Request(url, headers={'User-Agent': ua}))
    # Adjust the charset here if the target site changes encoding.
    return reply.read().decode('utf-8')
# 2. Process data (matching rules)
def get_url(content):
    """Extract (href, name, date) triples from the CNNIC report-list markup.

    The rule is site-specific; adjust it when the page structure changes.
    """
    link_re = re.compile('<li><div.*?><a href=\'(.*?)\'.*?>(.*?)</a></div><div.*?>(.*?)</div></li>', re.S)
    return link_re.findall(content)
# 3. Display data
def show_result(items):
    """Print each scraped report link as 'name|url' and download it.

    items: list of (href, name, date) tuples produced by get_url().
    """
    for item in items:
        curl = item[0]
        cname = item[1]
        # Build the complete link address.
        # NOTE(review): find('') always returns 0, so this condition is always
        # true — the original literal was probably a CJK keyword lost during
        # extraction; confirm against the upstream repository.
        if str(cname).find('')>-1:
            # Relative './' links are rooted at the report-archive index.
            curl=str(curl).replace('./','/')
            curl = 'http://www.cnnic.net.cn/hlwfzyj/hlwxzbg'+str(curl)
            strs = cname+'|'+curl
            # Show the link info
            print(strs)
            # Download the data
            get_file(curl)
# 4. Download data (write to file)
def get_file(url):
    """Download *url* to E:/download/<basename>, streaming in 8 KiB chunks.

    The destination directory is hard-coded; adjust for your environment.
    Propagates URLError/OSError on network or filesystem failure.
    """
    file_name = url.split('/')[-1]
    # Set the download path (adjust to the actual environment).
    file_names = 'E:/download/' + file_name
    req = request.Request(url)
    # with-blocks close both the HTTP response and the file even when a
    # read/write raises mid-transfer (the original leaked both on error).
    with request.urlopen(req) as u, open(file_names, 'wb') as f:
        block_sz = 8192
        while True:
            buffer = u.read(block_sz)
            if not buffer:
                break
            f.write(buffer)
    # Fixed typo in the status message: "Sucessful" -> "Successful".
    print("Successful to download" + " " + file_names)
# Download reports from the numbered index pages.
# NOTE(review): the original comment said "reports 1~37", but range(1, 7)
# only walks index_1 … index_6 — confirm the intended page count.
for i in range(1,7):
    url = 'http://www.cnnic.net.cn/hlwfzyj/hlwxzbg/index_'+str(i)+'.htm'
    # Fetch the page
    html = get_html(url)
    # Extract the download addresses
    items = get_url(html)
    # Print and download the data
    show_result(items)
# Reports 38~49 (per the original comment) live on the unnumbered index page.
url = 'http://www.cnnic.net.cn/hlwfzyj/hlwxzbg/index.htm'
html = get_html(url)
items = get_url(html)
show_result(items)

46
code/miit.py Normal file
View File

@@ -0,0 +1,46 @@
# -*- coding:utf-8 -*-
# Author: yiliyas
# Date: 2022.5.18
from urllib import request
import re
# MIIT industry-statistics scraper: fetch the statistics index page and
# print every listed article as a Markdown table row
# '|<category>|[<title>](<url>)|<date>', categorised by the section token
# embedded in the article URL.
UA = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36'
index_req = request.Request('https://www.miit.gov.cn/gxsj/index.html', headers={'User-Agent': UA})
page = request.urlopen(index_req).read().decode('utf-8-sig')

# Grab (href, title, date) for every article entry on the index page.
entry_re = re.compile('<p><a href="(.*?)".*?title="(.*?)">.*?</a><span>(.*?)</span></p>', re.S)

# URL token -> category label, checked in this fixed order. An article
# whose URL contains several tokens is printed once per matching token,
# exactly like the original chain of independent if-statements.
CATEGORY_LABELS = [
    ('yclgy', '原材料工业'),
    ('zbgy', '装备工业'),
    ('xfpgy', '消费品工业'),
    ('txy', '通信业'),
    ('dzxx', '电子信息制造业'),
    ('rjy', '软件业'),
    ('hlw', '互联网'),
    ('wlaq', '网络安全'),
]

for href, title, day in entry_re.findall(page):
    full_url = 'https://www.miit.gov.cn' + href
    for token, label in CATEGORY_LABELS:
        if token in full_url:
            print('|' + label + '|[' + title + '](' + full_url + ')|' + day)

80
code/tjgb.py Normal file
View File

@@ -0,0 +1,80 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author: yiliyas
# Date: 2022.5.18
import re
from urllib import request
# 1. Fetch data
def get_html(url):
    """Fetch *url* over HTTP(S) and return the response body decoded as UTF-8."""
    # Spoof a desktop-browser User-Agent so the server returns the normal page.
    ua = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36'
    reply = request.urlopen(request.Request(url, headers={'User-Agent': ua}))
    # Adjust the charset here if the target site changes encoding.
    return reply.read().decode('utf-8')
# 2. Process data (matching rules)
def get_url(content):
    """Extract (href, name, extra) triples from the stats.gov.cn bulletin list.

    The rule is site-specific; adjust it when the page structure changes.
    """
    link_re = re.compile('<a href="(.*?)".*?><span.*?><img.*?><font.*?>(.*?)</font><font.*?>(.*?)</font></span></a>', re.S)
    return link_re.findall(content)
# 3. Display data
def show_result(items):
    """Print each bulletin as a Markdown link '|[name](url)' and download it.

    items: (href, name, extra) tuples from get_url(). The URL prefix is
    chosen from the year parsed out of the bulletin name.
    """
    for item in items:
        curl = item[0]
        cname = item[1]
        # Build the complete link address, e.g.
        # http://www.stats.gov.cn/tjsj/zxfb/201602/t20160229_1323991.html
        # NOTE(review): find('') is always 0 and replace('','') is a no-op —
        # both empty literals were likely CJK text (e.g. '年') lost during
        # extraction, so int(cyear) may raise on non-numeric names; confirm
        # against the upstream repository.
        if str(cname).find('')>-1:
            cyear = str(cname).replace('','')
            iyear = int(cyear)
            # Post-2012 bulletins are hosted directly under the site root.
            if iyear>2012:
                curl = 'http://www.stats.gov.cn'+str(curl)
            # 1978–2012 bulletins live in the ndtjgb archive.
            if iyear>=1978 and iyear<=2012:
                curl=str(curl).replace('./','/')
                curl = 'http://www.stats.gov.cn/tjsj/tjgb/ndtjgb'+str(curl)
            strs = '|['+cname+']('+curl+')'
            # Show the link info
            print(strs)
            # Download the data
            get_file(curl)
# 4. Download data (write to file)
def get_file(url):
    """Download *url* to E:/download/<basename>, streaming in 8 KiB chunks.

    The destination directory is hard-coded; adjust for your environment.
    Propagates URLError/OSError on network or filesystem failure.
    """
    file_name = url.split('/')[-1]
    # Adjust this path for your environment.
    file_names = 'E:/download/' + file_name
    req = request.Request(url)
    # with-blocks close both the HTTP response and the file even when a
    # read/write raises mid-transfer (the original leaked both on error).
    with request.urlopen(req) as u, open(file_names, 'wb') as f:
        block_sz = 8192
        while True:
            buffer = u.read(block_sz)
            if not buffer:
                break
            f.write(buffer)
    # Fixed typo in the status message: "Sucessful" -> "Successful".
    print("Successful to download" + " " + file_name)
# Download the 2013–2021 annual bulletins.
# NOTE(review): the original comments label index.html as "2022-2021年" and
# index_1.html as "1983-2001年"; only index_2.html is active below — confirm
# which year range each index page actually covers before switching.
#url = 'http://www.stats.gov.cn/tjsj/tjgb/ndtjgb/index.html'
#url = 'http://www.stats.gov.cn/tjsj/tjgb/ndtjgb/index_1.html'
#
url = 'http://www.stats.gov.cn/tjsj/tjgb/ndtjgb/index_2.html'
# Fetch the page
html = get_html(url)
# Extract the download addresses
items = get_url(html)
# Print and download the data
show_result(items)

69
code/tjnq.py Normal file
View File

@@ -0,0 +1,69 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author: yiliyas
# Date: 2022.5.18
import re
from urllib import request
# 1. Fetch data
def get_html(url):
    """Fetch *url* over HTTP(S) and return the response body decoded as GB2312."""
    # Spoof a desktop-browser User-Agent so the server returns the normal page.
    ua = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36'
    reply = request.urlopen(request.Request(url, headers={'User-Agent': ua}))
    return reply.read().decode('gb2312')
# 2. Process data (matching rules)
def get_url(content):
    """Extract (href, name) pairs from the yearbook chapter list markup.

    The rule is site-specific; adjust it when the page structure changes.
    """
    link_re = re.compile('<li><a href=\'(.*?)\'>(.*?)</a></li>', re.S)
    return link_re.findall(content)
# 3. Display data
def show_result(items):
    """Print each chapter link as '|[name](url)' and download it.

    items: (href, name) pairs from get_url(). Reads the module-level
    global ``iyear`` (set by the driver loop) to build the per-year URL.
    """
    for item in items:
        curl = item[0]
        cname = item[1]
        # Build the complete link: .../tjsj/ndsj/<year>/<page>
        curl = 'http://www.stats.gov.cn/tjsj/ndsj/'+str(iyear)+'/'+str(curl)
        strs = '|['+cname+']('+curl+')'
        # Show the link info
        print(strs)
        # Download the data
        get_file(curl)
# 4. Download data (write to file)
def get_file(url):
    """Download *url* into E:/download/, prefixing the file name with the
    year being processed (module-global ``iyear``) so identically named
    files from different yearbooks do not overwrite each other.

    The destination directory is hard-coded; adjust for your environment.
    Propagates URLError/OSError on network or filesystem failure.
    """
    file_name = url.split('/')[-1]
    # Adjust this path for your environment; iyear comes from the driver loop.
    file_names = 'E:/download/' + str(iyear) + file_name
    req = request.Request(url)
    # with-blocks close both the HTTP response and the file even when a
    # read/write raises mid-transfer (the original leaked both on error).
    with request.urlopen(req) as u, open(file_names, 'wb') as f:
        block_sz = 8192
        while True:
            buffer = u.read(block_sz)
            if not buffer:
                break
            f.write(buffer)
    # Fixed typo in the status message: "Sucessful" -> "Successful".
    print("Successful to download" + " " + file_names)
# Driver: download the 2005–2021 statistical yearbooks.
# show_result()/get_file() read ``iyear`` as a module-level global, which the
# loop variable below provides. (The original bare ``global iyear`` statement
# at module scope was a no-op and has been dropped.)
#
# Fix: range() excludes its upper bound, so the original range(2005, 2021)
# stopped at 2020 and silently skipped the 2021 edition promised by its own
# "2005~2021" comment (and by the commented-out .../ndsj/2021/left.htm URL).
for iyear in range(2005, 2022):
    url = 'http://www.stats.gov.cn/tjsj/ndsj/' + str(iyear) + '/left.htm'
    # Fetch the chapter index for this year
    html = get_html(url)
    # Extract the chapter links
    items = get_url(html)
    # Print and download each chapter
    show_result(items)