I am opening a file which has 100,000 URL's. I need to send an HTTP request to each URL and print the status code. I am using Python 2.6, and so far looked at the many confusing ways Python implements threading/concurrency. I have even looked at the python concurrence library, but cannot figure out how to write this program correctly. Has anyone come across a similar problem? I guess generally I need to know how to perform thousands of tasks in Python as fast as possible - I suppose that means 'concurrently'.
当前回答
使用grequests,它是requests + Gevent模块的组合。
GRequests允许您使用带有Gevent的Requests来轻松地生成异步HTTP请求。
用法很简单:
import grequests
urls = [
'http://www.heroku.com',
'http://tablib.org',
'http://httpbin.org',
'http://python-requests.org',
'http://kennethreitz.com'
]
创建一组未发送的请求:
>>> rs = (grequests.get(u) for u in urls)
同时发送:
>>> grequests.map(rs)
[<Response [200]>, <Response [200]>, <Response [200]>, <Response [200]>, <Response [200]>]
其他回答
使用线程池是一个很好的选择,这将使这相当容易。不幸的是,python并没有一个标准库来简化线程池。但这里有一个不错的图书馆,你应该开始: http://www.chrisarndt.de/projects/threadpool/
来自他们网站的代码示例:
pool = ThreadPool(poolsize)
requests = makeRequests(some_callable, list_of_args, callback)
[pool.putRequest(req) for req in requests]
pool.wait()
希望这能有所帮助。
Scrapy框架将快速和专业地解决您的问题。它还将缓存所有请求,以便稍后可以重新运行失败的请求。
将该脚本保存为quotes_spider.py。
# quote_spiders.py
import json
import string
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.item import Item, Field
class TextCleaningPipeline(object):
def _clean_text(self, text):
text = text.replace('“', '').replace('”', '')
table = str.maketrans({key: None for key in string.punctuation})
clean_text = text.translate(table)
return clean_text.lower()
def process_item(self, item, spider):
item['text'] = self._clean_text(item['text'])
return item
class JsonWriterPipeline(object):
def open_spider(self, spider):
self.file = open(spider.settings['JSON_FILE'], 'a')
def close_spider(self, spider):
self.file.close()
def process_item(self, item, spider):
line = json.dumps(dict(item)) + "\n"
self.file.write(line)
return item
class QuoteItem(Item):
text = Field()
author = Field()
tags = Field()
spider = Field()
class QuoteSpider(scrapy.Spider):
name = "quotes"
def start_requests(self):
urls = [
'http://quotes.toscrape.com/page/1/',
'http://quotes.toscrape.com/page/2/',
# ...
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
for quote in response.css('div.quote'):
item = QuoteItem()
item['text'] = quote.css('span.text::text').get()
item['author'] = quote.css('small.author::text').get()
item['tags'] = quote.css('div.tags a.tag::text').getall()
item['spider'] = self.name
yield item
if __name__ == '__main__':
settings = dict()
settings['USER_AGENT'] = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
settings['HTTPCACHE_ENABLED'] = True
settings['CONCURRENT_REQUESTS'] = 20
settings['CONCURRENT_REQUESTS_PER_DOMAIN'] = 20
settings['JSON_FILE'] = 'items.jl'
settings['ITEM_PIPELINES'] = dict()
settings['ITEM_PIPELINES']['__main__.TextCleaningPipeline'] = 800
settings['ITEM_PIPELINES']['__main__.JsonWriterPipeline'] = 801
process = CrawlerProcess(settings=settings)
process.crawl(QuoteSpider)
process.start()
紧随其后的是
$ pip install Scrapy
$ python quote_spiders.py
为了微调scraper,相应地调整CONCURRENT_REQUESTS和CONCURRENT_REQUESTS_PER_DOMAIN设置。
自从2010年这篇文章发布以来,事情发生了很大的变化,我还没有尝试过所有其他的答案,但我尝试了一些,我发现使用python3.6对我来说这是最好的。
在AWS上运行时,我每秒可以获取大约150个独特的域名。
import concurrent.futures
import requests
import time
out = []
CONNECTIONS = 100
TIMEOUT = 5
tlds = open('../data/sample_1k.txt').read().splitlines()
urls = ['http://{}'.format(x) for x in tlds[1:]]
def load_url(url, timeout):
ans = requests.head(url, timeout=timeout)
return ans.status_code
with concurrent.futures.ThreadPoolExecutor(max_workers=CONNECTIONS) as executor:
future_to_url = (executor.submit(load_url, url, TIMEOUT) for url in urls)
time1 = time.time()
for future in concurrent.futures.as_completed(future_to_url):
try:
data = future.result()
except Exception as exc:
data = str(type(exc))
finally:
out.append(data)
print(str(len(out)),end="\r")
time2 = time.time()
print(f'Took {time2-time1:.2f} s')
考虑使用风车,虽然风车可能不能做那么多线程。
您可以在5台机器上使用手卷Python脚本,每台机器使用端口40000-60000连接出站,打开100,000个端口连接。
另外,使用一个线程良好的QA应用程序(如OpenSTA)做一个示例测试可能会有所帮助,以了解每个服务器可以处理多少。
另外,试着在LWP::ConnCache类中使用简单的Perl。这样您可能会获得更好的性能(更多的连接)。
(下一个项目的自我提示)
Python 3解决方案只使用请求。它是最简单且快速的,不需要多处理或复杂的异步库。
最重要的方面是重用连接,特别是对于HTTPS (TLS需要额外的往返才能打开)。注意,连接是特定于子域的。如果在多个域上抓取多个页面,则可以对url列表进行排序,以最大化连接重用(它有效地按域进行排序)。
当给定足够的线程时,它将与任何异步代码一样快。(请求在等待响应时释放python GIL)。
[带有日志记录和错误处理的生产等级代码]
import logging
import requests
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
# source: https://stackoverflow.com/a/68583332/5994461
THREAD_POOL = 16
# This is how to create a reusable connection pool with python requests.
session = requests.Session()
session.mount(
'https://',
requests.adapters.HTTPAdapter(pool_maxsize=THREAD_POOL,
max_retries=3,
pool_block=True)
)
def get(url):
response = session.get(url)
logging.info("request was completed in %s seconds [%s]", response.elapsed.total_seconds(), response.url)
if response.status_code != 200:
logging.error("request failed, error code %s [%s]", response.status_code, response.url)
if 500 <= response.status_code < 600:
# server is overloaded? give it a break
time.sleep(5)
return response
def download(urls):
with ThreadPoolExecutor(max_workers=THREAD_POOL) as executor:
# wrap in a list() to wait for all requests to complete
for response in list(executor.map(get, urls)):
if response.status_code == 200:
print(response.content)
def main():
logging.basicConfig(
format='%(asctime)s.%(msecs)03d %(levelname)-8s %(message)s',
level=logging.INFO,
datefmt='%Y-%m-%d %H:%M:%S'
)
urls = [
"https://httpstat.us/200",
"https://httpstat.us/200",
"https://httpstat.us/200",
"https://httpstat.us/404",
"https://httpstat.us/503"
]
download(urls)
if __name__ == "__main__":
main()
推荐文章
- 什么是HTTP“主机”报头?
- Numpy Max vs amax vs maximum
- 我应该在.gitignore文件中添加Django迁移文件吗?
- 每n行有熊猫
- 实例属性attribute_name定义在__init__之外
- 如何获取在Python中捕获的异常的名称?
- 第一次出现的值大于现有值的Numpy
- 如何从Python函数中返回两个值?
- 前一个月的Python日期
- Python中方括号括起来的列表和圆括号括起来的列表有什么区别?
- Python日志记录不输出任何东西
- 每n秒运行特定代码
- SQLAlchemy是否有与Django的get_or_create等价的函数?
- 如何将python datetime转换为字符串,具有可读格式的日期?
- 美丽的汤和提取div及其内容的ID