站点sitemap.xml内容格式
Python简单实现
简单实现:提取站点地图中的URL并保存到url.txt文件
import xml.dom.minidom as xmldom
import urllib.request
import xml
# Send a browser-like User-Agent instead of urllib's default one.
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0"
}
# URL of the sitemap to read.
sitemap_url = "https://www.nstns.com/sitemap.xml"
# File that receives the extracted URLs, one per line (overwritten on each run).
output_path = "D:\\url.txt"
# Seconds to wait on the network before giving up instead of hanging forever.
request_timeout = 30


def extract_urls(xml_source):
    """Return the text of the first <loc> inside every <url> element.

    Args:
        xml_source: The sitemap document, either as XML text (``str`` or
            ``bytes``) or as a binary file-like object such as the response
            returned by ``urllib.request.urlopen``.

    Returns:
        A list of URL strings in document order. ``<url>`` entries that have
        no ``<loc>`` child, or whose ``<loc>`` is empty, are skipped.

    Raises:
        xml.parsers.expat.ExpatError: If the document is not well-formed XML.
    """
    if isinstance(xml_source, (str, bytes)):
        dom = xmldom.parseString(xml_source)
    else:
        dom = xmldom.parse(xml_source)

    text_types = (xmldom.Node.TEXT_NODE, xmldom.Node.CDATA_SECTION_NODE)
    urls = []
    for url_node in dom.documentElement.getElementsByTagName("url"):
        loc_nodes = url_node.getElementsByTagName("loc")
        if not loc_nodes:
            continue
        # Join every text/CDATA child: the URL may be wrapped in CDATA or be
        # split by a comment, in which case firstChild alone is incomplete.
        loc_text = "".join(
            child.data
            for child in loc_nodes[0].childNodes
            if child.nodeType in text_types
        ).strip()
        if loc_text:
            urls.append(loc_text)
    return urls


def fetch_sitemap_urls(url, request_headers, timeout=request_timeout):
    """Download the sitemap at *url* and return the URLs it lists.

    Args:
        url: Address of the sitemap XML document.
        request_headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the server.

    Returns:
        The list produced by ``extract_urls`` for the downloaded document.

    Raises:
        urllib.error.URLError: If the request fails.
    """
    request = urllib.request.Request(url=url, headers=request_headers)
    # The context manager closes the connection even if parsing fails.
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return extract_urls(response)


def save_urls(urls, path):
    """Write *urls* to *path* as UTF-8 text, one URL per line.

    Any existing file at *path* is replaced; an empty *urls* leaves an
    empty file.
    """
    with open(path, 'w', encoding="utf-8") as out_file:
        out_file.writelines(url + '\n' for url in urls)


def main():
    """Fetch the configured sitemap and save its URLs to the output file."""
    # Fetch first so the output file is only replaced after a successful download.
    urls = fetch_sitemap_urls(sitemap_url, headers)
    save_urls(urls, output_path)


if __name__ == "__main__":
    main()