From b1fffc1fab8432388bf64393762d4330a518bd9e Mon Sep 17 00:00:00 2001
From: Stefal
Date: Thu, 14 Sep 2023 23:27:03 +0200
Subject: [PATCH] concurrency on get_image_data

---
 download.py | 41 +++++++++++++++++++++++++++++++++++------
 1 file changed, 35 insertions(+), 6 deletions(-)

diff --git a/download.py b/download.py
index c32b33d..5903d5f 100644
--- a/download.py
+++ b/download.py
@@ -4,6 +4,7 @@ from requests.adapters import Retry
 import json
 import os
 import asyncio
+import concurrent.futures
 import argparse
 from datetime import datetime
 import writer
@@ -32,12 +33,11 @@ def background(f):
         return asyncio.get_event_loop().run_in_executor(None, f, *args, **kwargs)
     return wrapped
 
-#TODO add try/except and retry (see https://www.zenrows.com/blog/python-requests-retry#avoid-getting-blocked)
-@background
-def download(url, fn, metadata=None):
+#@background
+def download(url, filepath, metadata=None):
     r = session.get(url, stream=True, timeout=6)
     image = write_exif(r.content, metadata)
-    with open(str(fn), "wb") as f:
+    with open(str(filepath), "wb") as f:
         f.write(image)
 
 def get_single_image_data(image_id, mly_header):
@@ -62,6 +62,31 @@ def get_image_data_from_sequences(sequences_id, mly_header):
             image_data['sequence_id'] = sequence_id
             yield image_data
 
+def get_image_data_from_sequences__future(sequences_id, mly_header):
+    for i,sequence_id in enumerate(sequences_id):
+        url = 'https://graph.mapillary.com/image_ids?sequence_id={}'.format(sequence_id)
+        r = requests.get(url, headers=header)
+        data = r.json()
+        image_ids = data['data']
+        total_image = len(image_ids)
+        print("{} images in sequence {} of {} - id : {}".format(total_image, i+1, len(sequences_id), sequence_id))
+        print('getting images data')
+
+        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
+            future_to_url = {}
+            for x in range(0, total_image):
+                image_id = image_ids[x]['id']
+                future_to_url[executor.submit(get_single_image_data, image_id, mly_header)] = image_id
+            for future in concurrent.futures.as_completed(future_to_url):
+                url = future_to_url[future]
+                image_data = future.result()
+                image_data['sequence_id'] = sequence_id
+                #print(image_data)
+                yield image_data
+        #image_data = get_single_image_data(image_id, mly_header)
+        #image_data['sequence_id'] = sequence_id
+        #yield image_data
+
 def write_exif(picture, img_metadata):
     '''
     Write exif metadata
@@ -95,10 +120,12 @@ if __name__ == '__main__':
     if not os.path.exists('data'):
         os.makedirs('data')
 
-    for i,image_data in enumerate(get_image_data_from_sequences(sequence_ids, header)):
+    #for i,image_data in enumerate(get_image_data_from_sequences(sequence_ids, header)):
+    for i,image_data in enumerate(get_image_data_from_sequences__future(sequence_ids, header)):
         if args.image_limit is not None and i >= args.image_limit:
             break
         images_data.append(image_data)
+        #sys.exit()
 
     print('downloading.. this process will take a while. please wait')
     for i,image_data in enumerate(images_data):
@@ -116,4 +143,6 @@ if __name__ == '__main__':
             direction = image_data['compass_angle'],
             altitude = image_data['altitude'],
         )
-        download(image_data['thumb_original_url'],path, img_metadata)
+        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
+            executor.submit(download, url=image_data['thumb_original_url'], filepath=path, metadata=img_metadata)
+        #download(image_data['thumb_original_url'],path, img_metadata)
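
The TODO that this patch drops pointed at adding try/except and retry logic
around the HTTP calls. download.py already imports Retry from
requests.adapters, so a minimal sketch of wiring retries into the
module-level session that download() uses could look like the following
(the retry parameters are illustrative assumptions, not the project's
settings):

    import requests
    from requests.adapters import HTTPAdapter, Retry

    # Illustrative policy: retry transient HTTP errors with exponential
    # backoff before download() gives up on a picture.
    retries = Retry(total=5,
                    backoff_factor=0.5,  # sleep 0.5s, 1s, 2s, ... between attempts
                    status_forcelist=[429, 500, 502, 503, 504])
    session = requests.Session()
    session.mount('https://', HTTPAdapter(max_retries=retries))
    session.mount('http://', HTTPAdapter(max_retries=retries))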
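One behavioral note on get_image_data_from_sequences__future: as_completed
yields futures in completion order, so images_data may no longer follow the
order of images within a sequence. If downstream code expects sequence
order, a variant built on executor.map keeps results in submission order.
This is a sketch, not part of the patch; get_image_data_ordered is a
hypothetical helper reusing get_single_image_data from the diff:

    import concurrent.futures

    def get_image_data_ordered(image_ids, sequence_id, mly_header, max_workers=10):
        # executor.map returns results in submission order, unlike
        # as_completed, at the cost of waiting on slow early requests.
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            results = executor.map(
                lambda image_id: get_single_image_data(image_id, mly_header),
                [img['id'] for img in image_ids])
            for image_data in results:
                image_data['sequence_id'] = sequence_id
                yield image_data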
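In the last hunk, a new ThreadPoolExecutor is created inside the per-image
loop and a single task is submitted to it; leaving the with block calls
shutdown(wait=True), so each download still finishes before the next one is
submitted. A sketch of sharing one pool across the whole loop, mirroring the
submit/as_completed pattern the patch already uses for metadata (build_path
and build_metadata are hypothetical stand-ins for the path and
writer.ImageMetadata code already in the loop body):

    import concurrent.futures

    def download_all(images_data, build_path, build_metadata, max_workers=10):
        # One shared pool for every image, so downloads actually overlap
        # instead of running one at a time behind a per-image executor.
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [
                executor.submit(download,
                                url=image_data['thumb_original_url'],
                                filepath=build_path(image_data),
                                metadata=build_metadata(image_data))
                for image_data in images_data
            ]
            for future in concurrent.futures.as_completed(futures):
                future.result()  # re-raises any exception from download()

Collecting the futures and calling result() also surfaces exceptions that a
fire-and-forget executor.submit would otherwise hide.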