- #1. epoll 并不代表一定比 select 好
- # 在并发高的情况下, 连接活跃度不是很高, epoll 比 select 好
- # 并发性不高, 同时连接很活跃, select 比 epoll 好
- # 通过非阻塞 io 实现 http 请求
- # select + 回调 + 事件循环
- # 并发性高
- # 使用单线程
- import socket
- from urllib.parse import urlparse
- from selectors import DefaultSelector, EVENT_READ, EVENT_WRITE
# Module-level state shared by every Fetcher and by the event loop.
# select-style multiplexing is used to carry out the HTTP requests.
stop = False  # flipped to True once the last pending URL has been handled
urls = []  # URLs whose fetch has been started but not yet completed
selector = DefaultSelector()  # selector all sockets get registered with
class Fetcher:
    """Fetch one HTTP page over a non-blocking socket using selector callbacks.

    The instance registers its socket with the module-level ``selector``;
    the event loop is expected to call the registered callback
    (``connected`` first, then ``readable``) with the selector key.
    Completion is tracked through the module-level ``urls`` list and the
    module-level ``stop`` flag.
    """

    @staticmethod
    def _split_url(url):
        """Split *url* into ``(connect_host, port, host_header, target)``.

        ``connect_host``/``port`` are what the socket connects to (port
        defaults to 80), ``host_header`` is the raw netloc sent in the
        ``Host`` header, and ``target`` is the request path (``/`` when the
        URL has none) including the query string when present.

        Raises:
            ValueError: if the URL contains a malformed port.
        """
        parts = urlparse(url)
        target = parts.path or "/"
        if parts.query:
            target = "{}?{}".format(target, parts.query)
        connect_host = parts.hostname or parts.netloc
        return connect_host, parts.port or 80, parts.netloc, target

    @staticmethod
    def _extract_body(raw):
        """Return the decoded body of the raw HTTP response bytes *raw*.

        Everything after the first blank line is the body, so blank lines
        inside the body are preserved. A response without a header/body
        separator yields an empty string. Undecodable bytes are replaced
        rather than raising, so one bad page cannot abort the event loop.
        The body is returned as received (no transfer-decoding is done).
        """
        _headers, _sep, body = raw.partition(b"\r\n\r\n")
        return body.decode("utf8", errors="replace")

    def connected(self, key):
        """Callback for a writable socket: send the GET request.

        After sending, the socket is re-registered for read events so that
        ``readable`` receives the response.
        """
        selector.unregister(key.fd)
        request = "GET {} HTTP/1.1\r\nHost:{}\r\nConnection:close\r\n\r\n".format(
            self.path, self.host
        )
        self.client.send(request.encode("utf8"))
        selector.register(self.client.fileno(), EVENT_READ, self.readable)

    def readable(self, key):
        """Callback for a readable socket: accumulate data or finish up.

        While the peer keeps sending, chunks are appended to ``self.data``.
        When the peer closes the connection, the body is printed, the socket
        is closed, the URL is removed from ``urls`` and ``stop`` is set once
        no URLs remain.
        """
        global stop
        try:
            chunk = self.client.recv(1024)
        except BlockingIOError:
            # Readiness was reported but nothing is available yet;
            # wait for the next event instead of crashing the loop.
            return
        if chunk:
            self.data += chunk
            return

        # Empty read: the server closed the connection, response is complete.
        selector.unregister(key.fd)
        print(self._extract_body(self.data))
        self.client.close()
        urls.remove(self.spider_url)
        if not urls:
            stop = True

    def get_url(self, url):
        """Start fetching *url*: open a non-blocking connection and register it.

        Sets ``spider_url``, ``host``, ``path``, ``data`` and ``client`` on
        the instance, then registers the socket for write events with
        ``connected`` as the callback.
        """
        self.spider_url = url
        connect_host, port, self.host, self.path = self._split_url(url)
        self.data = b""

        # Open the socket connection in non-blocking mode.
        self.client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.client.setblocking(False)
        try:
            self.client.connect((connect_host, port))
        except BlockingIOError:
            # Expected for a non-blocking connect that is still in progress;
            # the selector reports the socket writable once it completes.
            pass

        # Register for the "writable" event, i.e. connection established.
        selector.register(self.client.fileno(), EVENT_WRITE, self.connected)
def loop():
    """Run the event loop until the module-level ``stop`` flag becomes true.

    Repeatedly asks the module-level ``selector`` which sockets are ready
    and invokes the callback stored as each ready key's ``data``, passing
    the key itself.
    """
    # 1. select itself has no built-in register/callback mechanism.
    # 2. Invoking the callback after a socket changes state is therefore
    #    the programmer's job, which is what this loop does.
    while not stop:
        for key, _events in selector.select():
            handler = key.data
            handler(key)
- #回调 + 事件循环 + select(poll\epoll)
if __name__ == "__main__":
    import time

    # Start 20 fetches on a single thread, drive them with the event loop,
    # and print the elapsed time in seconds.
    # perf_counter is monotonic, so the measurement cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()
    for goods_id in range(20):
        url = "http://shop.projectsedu.com/goods/{}/".format(goods_id)
        # Track the URL before starting so the fetcher can mark it done.
        urls.append(url)
        fetcher = Fetcher()
        fetcher.get_url(url)
    loop()
    print(time.perf_counter() - start_time)
- # def get_url(url):
- # #通过 socket 请求 html
- # url = urlparse(url)
- # host = url.netloc
- # path = url.path
- # if path == "":
- # path = "/"
- #
- # #建立 socket 连接
- # client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
- # client.setblocking(False)
- # try:
- # client.connect((host, 80)) #阻塞不会消耗 cpu
- # except BlockingIOError as e:
- # pass
- #
- # #不停的询问连接是否建立好, 需要 while 循环不停的去检查状态
- # #做计算任务或者再次发起其他的连接请求
- #
- # while True:
- # try:
- # client.send("GET {} HTTP/1.1\r\nHost:{}\r\nConnection:close\r\n\r\n".format(path, host).encode("utf8"))
- # break
- # except OSError as e:
- # pass
- #
- #
- # data = b""
- # while True:
- # try:
- # d = client.recv(1024)
- # except BlockingIOError as e:
- # continue
- # if d:
- # data += d
- # else:
- # break
- #
- # data = data.decode("utf8")
- # html_data = data.split("\r\n\r\n")[1]
- # print(html_data)
- # client.close()
来源: http://www.bubuko.com/infodetail-2572863.html