add tjnq.py tjgb.py miit.py internet.py

This commit is contained in:
yiliyassh@163.com
2022-05-18 17:54:33 +08:00
parent 8ec1ca4e19
commit 028fd96032
6 changed files with 288 additions and 18 deletions

76
code/internet.py Normal file
View File

@@ -0,0 +1,76 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author: yiliyas
# Date: 2022.5.18
import re
from urllib import request
# 1. Fetch data
def get_html(url):
    """Fetch *url* over HTTP(S) and return the response body decoded as UTF-8."""
    # Spoof a desktop-browser User-Agent so the server returns the normal page.
    ua = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36'
    reply = request.urlopen(request.Request(url, headers={'User-Agent': ua}))
    # Adjust the charset here if the target site changes encoding.
    return reply.read().decode('utf-8')
# 2. Process data (matching rules)
def get_url(content):
    """Extract (href, name, date) triples from the CNNIC report-list markup.

    The rule is site-specific; adjust it when the page structure changes.
    """
    link_re = re.compile('<li><div.*?><a href=\'(.*?)\'.*?>(.*?)</a></div><div.*?>(.*?)</div></li>', re.S)
    return link_re.findall(content)
# 3. Display data
def show_result(items):
    """Print each scraped report link as 'name|url' and download it.

    items: list of (href, name, date) tuples produced by get_url().
    """
    for item in items:
        curl = item[0]
        cname = item[1]
        # Build the complete link address.
        # NOTE(review): find('') always returns 0, so this condition is always
        # true — the original literal was probably a CJK keyword lost during
        # extraction; confirm against the upstream repository.
        if str(cname).find('')>-1:
            # Relative './' links are rooted at the report-archive index.
            curl=str(curl).replace('./','/')
            curl = 'http://www.cnnic.net.cn/hlwfzyj/hlwxzbg'+str(curl)
            strs = cname+'|'+curl
            # Show the link info
            print(strs)
            # Download the data
            get_file(curl)
# 4. Download data (write to file)
def get_file(url):
    """Download *url* to E:/download/<basename>, streaming in 8 KiB chunks.

    The destination directory is hard-coded; adjust for your environment.
    Propagates URLError/OSError on network or filesystem failure.
    """
    file_name = url.split('/')[-1]
    # Set the download path (adjust to the actual environment).
    file_names = 'E:/download/' + file_name
    req = request.Request(url)
    # with-blocks close both the HTTP response and the file even when a
    # read/write raises mid-transfer (the original leaked both on error).
    with request.urlopen(req) as u, open(file_names, 'wb') as f:
        block_sz = 8192
        while True:
            buffer = u.read(block_sz)
            if not buffer:
                break
            f.write(buffer)
    # Fixed typo in the status message: "Sucessful" -> "Successful".
    print("Successful to download" + " " + file_names)
# Download reports from the numbered index pages.
# NOTE(review): the original comment said "reports 1~37", but range(1, 7)
# only walks index_1 … index_6 — confirm the intended page count.
for i in range(1,7):
    url = 'http://www.cnnic.net.cn/hlwfzyj/hlwxzbg/index_'+str(i)+'.htm'
    # Fetch the page
    html = get_html(url)
    # Extract the download addresses
    items = get_url(html)
    # Print and download the data
    show_result(items)
# Reports 38~49 (per the original comment) live on the unnumbered index page.
url = 'http://www.cnnic.net.cn/hlwfzyj/hlwxzbg/index.htm'
html = get_html(url)
items = get_url(html)
show_result(items)

46
code/miit.py Normal file
View File

@@ -0,0 +1,46 @@
# -*- coding:utf-8 -*-
# Author: yiliyas
# Date: 2022.5.18
from urllib import request
import re
# MIIT industry-statistics scraper: fetch the statistics index page and
# print every listed article as a Markdown table row
# '|<category>|[<title>](<url>)|<date>', categorised by the section token
# embedded in the article URL.
UA = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36'
index_req = request.Request('https://www.miit.gov.cn/gxsj/index.html', headers={'User-Agent': UA})
page = request.urlopen(index_req).read().decode('utf-8-sig')

# Grab (href, title, date) for every article entry on the index page.
entry_re = re.compile('<p><a href="(.*?)".*?title="(.*?)">.*?</a><span>(.*?)</span></p>', re.S)

# URL token -> category label, checked in this fixed order. An article
# whose URL contains several tokens is printed once per matching token,
# exactly like the original chain of independent if-statements.
CATEGORY_LABELS = [
    ('yclgy', '原材料工业'),
    ('zbgy', '装备工业'),
    ('xfpgy', '消费品工业'),
    ('txy', '通信业'),
    ('dzxx', '电子信息制造业'),
    ('rjy', '软件业'),
    ('hlw', '互联网'),
    ('wlaq', '网络安全'),
]

for href, title, day in entry_re.findall(page):
    full_url = 'https://www.miit.gov.cn' + href
    for token, label in CATEGORY_LABELS:
        if token in full_url:
            print('|' + label + '|[' + title + '](' + full_url + ')|' + day)

80
code/tjgb.py Normal file
View File

@@ -0,0 +1,80 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author: yiliyas
# Date: 2022.5.18
import re
from urllib import request
# 1. Fetch data
def get_html(url):
    """Fetch *url* over HTTP(S) and return the response body decoded as UTF-8."""
    # Spoof a desktop-browser User-Agent so the server returns the normal page.
    ua = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36'
    reply = request.urlopen(request.Request(url, headers={'User-Agent': ua}))
    # Adjust the charset here if the target site changes encoding.
    return reply.read().decode('utf-8')
# 2. Process data (matching rules)
def get_url(content):
    """Extract (href, name, extra) triples from the stats.gov.cn bulletin list.

    The rule is site-specific; adjust it when the page structure changes.
    """
    link_re = re.compile('<a href="(.*?)".*?><span.*?><img.*?><font.*?>(.*?)</font><font.*?>(.*?)</font></span></a>', re.S)
    return link_re.findall(content)
# 3. Display data
def show_result(items):
    """Print each bulletin as a Markdown link '|[name](url)' and download it.

    items: (href, name, extra) tuples from get_url(). The URL prefix is
    chosen from the year parsed out of the bulletin name.
    """
    for item in items:
        curl = item[0]
        cname = item[1]
        # Build the complete link address, e.g.
        # http://www.stats.gov.cn/tjsj/zxfb/201602/t20160229_1323991.html
        # NOTE(review): find('') is always 0 and replace('','') is a no-op —
        # both empty literals were likely CJK text (e.g. '年') lost during
        # extraction, so int(cyear) may raise on non-numeric names; confirm
        # against the upstream repository.
        if str(cname).find('')>-1:
            cyear = str(cname).replace('','')
            iyear = int(cyear)
            # Post-2012 bulletins are hosted directly under the site root.
            if iyear>2012:
                curl = 'http://www.stats.gov.cn'+str(curl)
            # 1978–2012 bulletins live in the ndtjgb archive.
            if iyear>=1978 and iyear<=2012:
                curl=str(curl).replace('./','/')
                curl = 'http://www.stats.gov.cn/tjsj/tjgb/ndtjgb'+str(curl)
            strs = '|['+cname+']('+curl+')'
            # Show the link info
            print(strs)
            # Download the data
            get_file(curl)
# 4. Download data (write to file)
def get_file(url):
    """Download *url* to E:/download/<basename>, streaming in 8 KiB chunks.

    The destination directory is hard-coded; adjust for your environment.
    Propagates URLError/OSError on network or filesystem failure.
    """
    file_name = url.split('/')[-1]
    # Adjust this path for your environment.
    file_names = 'E:/download/' + file_name
    req = request.Request(url)
    # with-blocks close both the HTTP response and the file even when a
    # read/write raises mid-transfer (the original leaked both on error).
    with request.urlopen(req) as u, open(file_names, 'wb') as f:
        block_sz = 8192
        while True:
            buffer = u.read(block_sz)
            if not buffer:
                break
            f.write(buffer)
    # Fixed typo in the status message: "Sucessful" -> "Successful".
    print("Successful to download" + " " + file_name)
# Download the 2013–2021 annual bulletins.
# NOTE(review): the original comments label index.html as "2022-2021年" and
# index_1.html as "1983-2001年"; only index_2.html is active below — confirm
# which year range each index page actually covers before switching.
#url = 'http://www.stats.gov.cn/tjsj/tjgb/ndtjgb/index.html'
#url = 'http://www.stats.gov.cn/tjsj/tjgb/ndtjgb/index_1.html'
#
url = 'http://www.stats.gov.cn/tjsj/tjgb/ndtjgb/index_2.html'
# Fetch the page
html = get_html(url)
# Extract the download addresses
items = get_url(html)
# Print and download the data
show_result(items)

69
code/tjnq.py Normal file
View File

@@ -0,0 +1,69 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author: yiliyas
# Date: 2022.5.18
import re
from urllib import request
# 1. Fetch data
def get_html(url):
    """Fetch *url* over HTTP(S) and return the response body decoded as GB2312."""
    # Spoof a desktop-browser User-Agent so the server returns the normal page.
    ua = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36'
    reply = request.urlopen(request.Request(url, headers={'User-Agent': ua}))
    return reply.read().decode('gb2312')
# 2. Process data (matching rules)
def get_url(content):
    """Extract (href, name) pairs from the yearbook chapter list markup.

    The rule is site-specific; adjust it when the page structure changes.
    """
    link_re = re.compile('<li><a href=\'(.*?)\'>(.*?)</a></li>', re.S)
    return link_re.findall(content)
# 3. Display data
def show_result(items):
    """Print each chapter link as '|[name](url)' and download it.

    items: (href, name) pairs from get_url(). Reads the module-level
    global ``iyear`` (set by the driver loop) to build the per-year URL.
    """
    for item in items:
        curl = item[0]
        cname = item[1]
        # Build the complete link: .../tjsj/ndsj/<year>/<page>
        curl = 'http://www.stats.gov.cn/tjsj/ndsj/'+str(iyear)+'/'+str(curl)
        strs = '|['+cname+']('+curl+')'
        # Show the link info
        print(strs)
        # Download the data
        get_file(curl)
# 4. Download data (write to file)
def get_file(url):
    """Download *url* into E:/download/, prefixing the file name with the
    year being processed (module-global ``iyear``) so identically named
    files from different yearbooks do not overwrite each other.

    The destination directory is hard-coded; adjust for your environment.
    Propagates URLError/OSError on network or filesystem failure.
    """
    file_name = url.split('/')[-1]
    # Adjust this path for your environment; iyear comes from the driver loop.
    file_names = 'E:/download/' + str(iyear) + file_name
    req = request.Request(url)
    # with-blocks close both the HTTP response and the file even when a
    # read/write raises mid-transfer (the original leaked both on error).
    with request.urlopen(req) as u, open(file_names, 'wb') as f:
        block_sz = 8192
        while True:
            buffer = u.read(block_sz)
            if not buffer:
                break
            f.write(buffer)
    # Fixed typo in the status message: "Sucessful" -> "Successful".
    print("Successful to download" + " " + file_names)
# Driver: download the 2005–2021 statistical yearbooks.
# show_result()/get_file() read ``iyear`` as a module-level global, which the
# loop variable below provides. (The original bare ``global iyear`` statement
# at module scope was a no-op and has been dropped.)
#
# Fix: range() excludes its upper bound, so the original range(2005, 2021)
# stopped at 2020 and silently skipped the 2021 edition promised by its own
# "2005~2021" comment (and by the commented-out .../ndsj/2021/left.htm URL).
for iyear in range(2005, 2022):
    url = 'http://www.stats.gov.cn/tjsj/ndsj/' + str(iyear) + '/left.htm'
    # Fetch the chapter index for this year
    html = get_html(url)
    # Extract the chapter links
    items = get_url(html)
    # Print and download each chapter
    show_result(items)