1,
urllib.request.urlretrieve 可以根据文件的 URL 下载文件:
# -*- coding: utf-8 -*-
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Fetch the home page and parse it.
page = urlopen("http://www.pythonscraping.com/")
soup = BeautifulSoup(page, "lxml")

# The site logo is the <img> nested inside the <a id="logo"> anchor.
logo_src = soup.find("a", {"id":"logo"}).find("img")["src"]

# Save the image in the working directory as logo.jpg.
urlretrieve(logo_src, "logo.jpg")
这段程序从 http://pythonscraping.com 下载 logo 图片, 然后在程序运行的文件夹里保存为 logo.jpg 文件.
下面的程序会把 http://pythonscraping.com 主页上所有带 src 属性、后缀为 .jpg 的图片文件都下载下来:
# -*- coding: utf-8 -*-
import os
from urllib.request import urlopen
from urllib.request import urlretrieve

from bs4 import BeautifulSoup

# Local folder that received files are written to, and the site to mirror.
downloadDirectory = "downloaded"
baseUrl = "http://pythonscraping.com"
def getAbsoluteURL(baseUrl, source):
    """Resolve a src attribute value to an absolute URL on baseUrl.

    Normalizes away a leading "www." so the result is comparable with
    baseUrl, and returns None for URLs that point outside the site.
    """
    if source.startswith("http://www."):
        # Strip "www." so the host matches baseUrl.
        url = "http://" + source[11:]
    elif source.startswith("http://"):
        url = source
    elif source.startswith("www."):
        # Bug fix: the original assigned source[4:] and then immediately
        # overwrote it with "http://"+source, so "www." was never stripped
        # and these URLs always failed the baseUrl check below.
        url = "http://" + source[4:]
    else:
        # Relative path: anchor it at the site root.
        url = baseUrl + "/" + source
    if baseUrl not in url:
        # Off-site resource: caller skips it.
        return None
    return url
def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
    """Map an absolute URL to a local file path under downloadDirectory.

    Strips "www." and the baseUrl prefix, prepends downloadDirectory, and
    for .jpg targets creates the parent directories so a subsequent
    urlretrieve() can write the file.
    """
    path = absoluteUrl.replace("www.", "")
    path = path.replace(baseUrl, "")
    path = downloadDirectory + path
    if path.endswith(".jpg"):
        directory = os.path.dirname(path)
        # exist_ok=True avoids the check-then-create race of the original
        # os.path.exists() / os.makedirs() pair.
        os.makedirs(directory, exist_ok=True)
    return path
# Download every resource referenced by a src attribute on the home page.
page = urlopen("http://www.pythonscraping.com")
soup = BeautifulSoup(page, "lxml")

for tag in soup.findAll(src=True):
    resourceUrl = getAbsoluteURL(baseUrl, tag["src"])
    if resourceUrl is None:
        continue  # off-site resource, skip it
    print(resourceUrl)
    target = getDownloadPath(baseUrl, resourceUrl, downloadDirectory)
    urlretrieve(resourceUrl, target)
2,
# -*- coding: utf-8 -*-
import csv

# Write a small demo table. newline="" stops the csv module's "\r\n" row
# terminators from being doubled on Windows (per the csv docs), and the
# with-statement closes the file even on error — this also matches the
# editors.csv snippet later in this article.
with open("test.csv", 'w', newline="") as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow(('number', 'number plus 2', 'number times 2'))
    for i in range(10):
        writer.writerow((i, i + 2, i * 2))
运行以上代码后, 你会看到一个 CSV 文件:
number | number plus 2 | number times 2 |
0 | 2 | 0 |
1 | 3 | 2 |
2 | 4 | 4 |
3 | 5 | 6 |
4 | 6 | 8 |
5 | 7 | 10 |
6 | 8 | 12 |
7 | 9 | 14 |
8 | 10 | 16 |
9 | 11 | 18 |
获取维基百科词条中的 HTML 表格并写入 CSV 文件.
# -*- coding: utf-8 -*-
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup

page = urlopen("http://en.wikipedia.org/wiki/Comparison_of_text_editors")
soup = BeautifulSoup(page, "lxml")

# The main comparison table is the first "wikitable" on the page.
table = soup.findAll("table", {"class":"wikitable"})[0]
rows = table.findAll("tr")

# The with-statement closes the file even on error, matching the
# original try/finally.
with open("editors.csv", 'wt', newline="", encoding='utf-8') as csvFile:
    writer = csv.writer(csvFile)
    for row in rows:
        cells = [cell.get_text() for cell in row.findAll(['td', 'th'])]
        writer.writerow(cells)
来源: http://www.bubuko.com/infodetail-2579763.html