#!/usr/bin/env python2
# encoding: utf-8
"""
# @brief: fetch the jpg and jpeg images found on a web page
Usage:
    python fetchjpg.py http://example.com/
"""
import os
import random
import socket
import sys
from urllib import urlretrieve
from urllib2 import urlopen
from urlparse import urljoin

from bs4 import BeautifulSoup as bs
def random_suffix(jpgname):
    """Return ``jpgname`` with four random digits inserted before its extension.

    Used to derive an alternative file name when the wanted one is already
    taken, e.g. ``photo.jpg`` -> ``photo_0413.jpg``.

    @param jpgname: file name to alter; it may contain any number of dots,
        including none.
    @return: the new file name. The extension (the part from the last dot
        on, if any) is left untouched so the file keeps its type.
    """
    # splitext only splits at the last dot, so names such as "a.b.jpg" or
    # "noext" are handled without indexing into a split list.
    root, ext = os.path.splitext(jpgname)
    # Zero-padded so the suffix is always exactly four digits.
    return '%s_%04d%s' % (root, random.randint(0, 9999), ext)
def get_domain_name(url):
    """Return the ``scheme://host`` prefix of ``url``.

    @param url: an absolute URL such as ``https://example.com/a/b.jpg``.
        A URL without a scheme (``example.com/a`` or ``//example.com/a``)
        is treated as http.
    @return: the scheme and host without a trailing slash, e.g.
        ``https://example.com``.
    """
    scheme, separator, rest = url.partition('://')
    if not separator:
        # No explicit scheme: fall back to http and drop the leading
        # slashes of a protocol-relative address.
        scheme, rest = 'http', url.lstrip('/')
    # The host ends at the first path, query or fragment delimiter.
    for delimiter in '/?#':
        rest = rest.split(delimiter, 1)[0]
    return scheme + '://' + rest
def get_image_url(image_webpage_url):
    """Collect the page title and the JPEG image URLs found on a web page.

    @param image_webpage_url: address of the web page to scan.
    @return: a list whose first item is the page title (an empty string
        when the page has no title) and whose remaining items are the
        absolute URLs of every ``<img>`` whose ``src`` contains "jpg" or
        "jpeg", in document order.

    Errors raised by ``urlopen`` while fetching the page are not caught here.
    """
    # Fetch the page named by the argument rather than relying on a
    # module-level ``url`` variable being set by the caller.
    response = urlopen(image_webpage_url)
    try:
        soup = bs(response.read())
    finally:
        response.close()

    # Pages without <html> or <title> yield an empty title instead of an
    # AttributeError.
    html_tag = soup.html
    title_tag = html_tag.title if html_tag is not None else None
    page_title = title_tag.string if title_tag is not None else None
    image_url = [page_title or u'']

    for image in soup.findAll('img'):
        src = image.get('src')
        if not src:
            continue
        if "jpg" in src or "jpeg" in src:
            # urljoin leaves absolute URLs alone and resolves root-relative,
            # protocol-relative and page-relative paths against the page.
            image_url.append(urljoin(image_webpage_url, src))
    return image_url
def get_image(url, local_folder):
    """Download every JPEG image referenced by a web page into a folder.

    Each file is saved as ``<page title without dots>_<image base name>``.
    When that name is already taken, ``random_suffix`` is applied until a
    free name is found, so existing files are never overwritten.

    @param url: address of the web page to scan.
    @param local_folder: existing local directory to save the images into.
    @return: None. One ``source ==> destination`` line is printed per
        downloaded image; a message is written to stderr when
        ``local_folder`` is not a directory.
    """
    if not os.path.isdir(local_folder):
        # Say why nothing was downloaded instead of returning silently.
        sys.stderr.write("not a directory: %s\n" % local_folder)
        return

    urls = get_image_url(url)
    if not urls:
        return

    # The first item is the page title; the rest are image URLs.
    # Dots are removed from the title, and path separators are replaced so
    # the title cannot point the output path into a missing sub-directory.
    image_page_name = (urls[0] or '').replace('.', '')
    image_page_name = image_page_name.replace('/', '_').replace('\\', '_')

    # Set once for all downloads: urlretrieve has no timeout argument, so
    # the process-wide socket default is used.
    socket.setdefaulttimeout(60)

    for image in urls[1:]:
        filename = image_page_name + '_' + image.split("/")[-1]
        outpath = os.path.join(local_folder, filename)
        # Keep drawing new names until one is free.
        while os.path.exists(outpath):
            filename = random_suffix(filename)
            outpath = os.path.join(local_folder, filename)
        urlretrieve(image, outpath)
        print(image + " ==> " + outpath)
def _usage():
    """Print the one-line command-line usage hint to standard output."""
    print("usage: python fetchjpg.py http://example.com [outpath]")
if __name__ == "__main__":
    # Accept exactly one or two arguments: the page URL and, optionally,
    # the folder to save into.
    args = sys.argv[1:]
    if not 1 <= len(args) <= 2:
        _usage()
        sys.exit(-1)

    # `url` and `out_folder` are bound at module level; keep these names
    # stable in case other code in the file reads them.
    url = args[0]
    if len(args) == 2:
        out_folder = args[1]
    else:
        # Default local save path.
        out_folder = "/media/E/pics"

    if not url.lower().startswith("http"):
        _usage()
        sys.exit(-1)

    get_image(url, out_folder)
# This snippet comes from http://www.codesnippet.cn/detail/181120137242.html
# Source: http://www.codesnippet.cn/detail/181120137242.html