mapillary_download/mapillary_download.py

281 lines
10 KiB
Python
Raw Normal View History

2023-01-20 04:19:37 +01:00
import requests
2023-09-14 20:57:41 +02:00
from requests.adapters import HTTPAdapter
from requests.adapters import Retry
from PIL import Image
import io
2023-01-20 04:19:37 +01:00
import os
2023-09-14 23:27:03 +02:00
import concurrent.futures
2023-01-20 04:19:37 +01:00
import argparse
2023-09-10 20:04:26 +02:00
from datetime import datetime
import writer
2023-09-12 14:31:10 +02:00
from model import PictureType
2023-09-14 00:05:12 +02:00
import sys
2023-01-20 04:19:37 +01:00
2023-09-14 20:57:41 +02:00
# Retry policy shared by every request sent through ``session``: up to five
# attempts with a backoff factor of 1, retrying on the listed HTTP statuses.
retries_strategies = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[429, 502, 503, 504],
)

# Module-wide HTTP session; the retry policy is mounted for https:// URLs only.
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries_strategies))
2023-09-14 20:57:41 +02:00
def parse_args(argv=None):
    """Parse the command-line arguments of the downloader.

    Args:
        argv: Optional list of argument strings; when ``None`` argparse
            reads ``sys.argv[1:]``.

    Returns:
        The populated ``argparse.Namespace``.

    Raises:
        SystemExit: raised by argparse (status 2) when neither a sequence id
            nor an image id was supplied, or when the arguments are invalid.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--access_token", type=str, help="Your mapillary access token")
    parser.add_argument(
        "--sequence_ids",
        type=str,
        nargs="*",
        help="The mapillary sequence id(s) to download",
    )
    parser.add_argument(
        "--image_ids",
        type=int,
        nargs="*",
        help="The mapillary image id(s) to get their sequence id(s)",
    )
    parser.add_argument(
        "--destination",
        type=str,
        default="data",
        help="Path destination for the images",
    )
    parser.add_argument(
        "--image_limit",
        type=int,
        default=None,
        help="How many images you want to download",
    )
    parser.add_argument(
        "--username",
        type=str,
        default=None,
        help="The username to separate sequences for each user",
    )
    parser.add_argument(
        "--overwrite",
        default=False,
        action="store_true",
        help="overwrite existing images",
    )
    parser.add_argument("-v", "--version", action="version", version="release 1.6")

    args = parser.parse_args(argv)
    # nargs="*" produces an empty list (not None) when a flag is passed
    # without values, so test truthiness to reject "--sequence_ids" alone.
    if not args.sequence_ids and not args.image_ids:
        parser.error("Please enter at least one sequence id or image id")

    return args
2023-01-20 04:19:37 +01:00
def download(url, filepath, metadata=None):
    """Download one image, embed its metadata and save it to *filepath*.

    The image is fetched once through the shared ``session``. EXIF writing is
    first attempted on the downloaded bytes. If that fails, the image is
    re-encoded with Pillow after dropping every ``info`` field except
    ``exif`` and ``dpi``, and EXIF writing is retried. If that fails too, the
    downloaded bytes are saved unchanged.

    The destination file is only opened once the bytes to store are known, so
    a failed request never leaves an empty file behind.

    Args:
        url: URL of the image to fetch.
        filepath: Destination path of the image file.
        metadata: Metadata object handed to ``write_exif``.

    Raises:
        requests.RequestException: if the request itself fails (connection
            error, timeout, retries exhausted).
    """
    r = session.get(url, timeout=6)
    if not r.ok:
        # Do not store an HTTP error body under a .jpg name.
        print("{} FAILED to download: {}".format(filepath, r))
        return

    # Keep the payload so every fallback works on the same complete bytes.
    original_bytes = r.content
    try:
        img = write_exif(original_bytes, metadata)
    except Exception as e:
        print(
            f"{filepath} FAILED to write exif data. Error: {e} Retrying with reduced EXIF.".replace(
                "\n", " | "
            )
        )
        # write_exif crashes when JFIF fields are present, so as a workaround
        # those fields are removed with Pillow before trying again.
        try:
            im = Image.open(io.BytesIO(original_bytes))
            fields_to_keep = ("exif", "dpi")
            for k in list(im.info.keys()):
                if k not in fields_to_keep:
                    del im.info[k]
            # Done cleaning, now convert the Pillow image back to bytes.
            img_byte_arr = io.BytesIO()
            im.save(img_byte_arr, format="JPEG")
            img = write_exif(img_byte_arr.getvalue(), metadata)
        except Exception as e:
            print(
                f"{filepath} FAILED WORKAROUND. Error: {e} Saving image without EXIF data.".replace(
                    "\n", " | "
                )
            )
            img = original_bytes

    with open(str(filepath), "wb") as f:
        f.write(img)
    print("{} downloaded {}".format(filepath, r))
2023-01-20 04:19:37 +01:00
2023-09-20 22:10:06 +02:00
2023-09-14 00:05:12 +02:00
def get_single_image_data(image_id, mly_header):
    """Fetch the metadata of one image from the Mapillary graph API.

    The decoded JSON response is printed and then returned as-is.

    Args:
        image_id: Identifier of the image, inserted into the request URL.
        mly_header: HTTP headers sent with the request.

    Returns:
        The decoded JSON body of the response.
    """
    fields = (
        "creator,thumb_original_url,altitude,make,model,camera_type,"
        "captured_at,compass_angle,geometry,exif_orientation,sequence"
    )
    endpoint = f"https://graph.mapillary.com/{image_id}?fields={fields}"
    payload = session.get(endpoint, headers=mly_header).json()
    print(payload)
    return payload
2023-09-20 22:10:06 +02:00
2023-09-14 00:05:12 +02:00
def get_image_data_from_sequences(sequences_id, mly_header):
    """Yield the metadata of every image of the given sequences, one by one.

    For each sequence the list of image ids is requested, then each image's
    metadata is fetched sequentially with ``get_single_image_data`` and
    tagged with the sequence it belongs to under the ``"sequence_id"`` key.

    Requests go through the shared ``session`` and use the ``mly_header``
    argument rather than a module-level global, so the function also works
    when this module is imported.

    Args:
        sequences_id: Sequence ids to walk through.
        mly_header: HTTP headers sent with every request.

    Yields:
        The metadata returned by ``get_single_image_data`` for each image,
        with ``"sequence_id"`` added.

    Raises:
        KeyError: if the image-id response has no ``"data"`` entry.
    """
    for i, sequence_id in enumerate(sequences_id):
        url = "https://graph.mapillary.com/image_ids?sequence_id={}".format(sequence_id)
        r = session.get(url, headers=mly_header)
        data = r.json()

        image_ids = data["data"]
        total_image = len(image_ids)
        print(
            "{} images in sequence {} of {} - id : {}".format(
                total_image, i + 1, len(sequences_id), sequence_id
            )
        )
        print("getting images data")
        for image in image_ids:
            image_data = get_single_image_data(image["id"], mly_header)
            image_data["sequence_id"] = sequence_id
            yield image_data
2023-09-20 22:10:06 +02:00
2023-09-14 23:27:03 +02:00
def get_image_data_from_sequences__future(sequences_id, mly_header):
    """Yield the metadata of every image of the given sequences, concurrently.

    For each sequence the list of image ids is requested; the per-image
    metadata is then fetched by a pool of 10 worker threads and yielded in
    completion order, tagged with its sequence under ``"sequence_id"``.
    A sequence whose image list is empty is reported and skipped.

    Requests go through the shared ``session`` and use the ``mly_header``
    argument rather than a module-level global, so the function also works
    when this module is imported.

    Args:
        sequences_id: Sequence ids to walk through.
        mly_header: HTTP headers sent with every request.

    Yields:
        The metadata returned by ``get_single_image_data`` for each image,
        with ``"sequence_id"`` added.

    Raises:
        KeyError: if the image-id response has no ``"data"`` entry.
    """
    for i, sequence_id in enumerate(sequences_id):
        url = "https://graph.mapillary.com/image_ids?sequence_id={}".format(sequence_id)
        r = session.get(url, headers=mly_header)
        data = r.json()
        if data.get("data") == []:
            print(
                "Empty or wrong sequence {} of {} - id : {}".format(
                    i + 1, len(sequences_id), sequence_id
                )
            )
            continue
        image_ids = data["data"]
        total_image = len(image_ids)
        print(
            "{} images in sequence {} of {} - id : {}".format(
                total_image, i + 1, len(sequences_id), sequence_id
            )
        )
        print("getting images data")

        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            futures = [
                executor.submit(get_single_image_data, image["id"], mly_header)
                for image in image_ids
            ]
            for future in concurrent.futures.as_completed(futures):
                # result() re-raises any exception raised in the worker.
                image_data = future.result()
                image_data["sequence_id"] = sequence_id
                yield image_data
2023-09-20 22:10:06 +02:00
2023-09-12 14:31:10 +02:00
def write_exif(picture, img_metadata):
    """
    Embed the given metadata into a picture and return the updated bytes.

    The picture is wrapped in a ``writer.Writer``; the artist, camera make,
    camera model, original date/time, latitude/longitude, altitude,
    direction and image projection are added from ``img_metadata`` in that
    order, the changes are applied, and the resulting bytes are returned.
    """
    with writer.Writer(picture) as exif_writer:
        exif_writer.add_artist(img_metadata)
        exif_writer.add_camera_make(img_metadata)
        exif_writer.add_camera_model(img_metadata)
        exif_writer.add_datetimeoriginal(img_metadata)
        exif_writer.add_lat_lon(img_metadata)
        exif_writer.add_altitude(img_metadata)
        exif_writer.add_direction(img_metadata)
        exif_writer.add_img_projection(img_metadata)
        exif_writer.apply()
        tagged_picture = exif_writer.get_Bytes()

    return tagged_picture
2023-09-20 22:10:06 +02:00
# Script entry point:
#   1. resolve the sequences to download (given directly and/or looked up
#      from image ids),
#   2. collect the metadata of their images (up to --image_limit),
#   3. download the images with a pool of 4 worker threads into
#      <destination>/[<username>/]<sequence_id>/.
if __name__ == "__main__":
    args = parse_args()

    sequence_ids = args.sequence_ids if args.sequence_ids is not None else []
    images_ids = args.image_ids
    access_token = args.access_token
    images_data = []
    header = {"Authorization": "OAuth {}".format(access_token)}

    if images_ids:
        for image_id in images_ids:
            image_data = get_single_image_data(image_id, header)
            if "error" in image_data:
                print("data : ", image_data)
                print(
                    "something wrong happened ! Please check your image id and/or your connection"
                )
                # Non-zero status so callers can detect the failure.
                sys.exit(1)
            sequence_id = image_data.get("sequence")
            # Several images may belong to the same sequence, and an image
            # may come back without one: queue each real sequence only once.
            if sequence_id is not None and sequence_id not in sequence_ids:
                sequence_ids.append(sequence_id)

    for i, image_data in enumerate(
        get_image_data_from_sequences__future(sequence_ids, header)
    ):
        if args.image_limit is not None and i >= args.image_limit:
            break
        if "error" in image_data:
            print("data : ", image_data)
            print(
                "something wrong happened ! Please check your token and/or your connection"
            )
            sys.exit(1)
        images_data.append(image_data)

    print("downloading.. this process will take a while. please wait")
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        pending_downloads = {}
        for image_data in images_data:
            # Create a folder for each unique sequence ID to group images by
            # sequence; --username is optional, so only insert it when set.
            path_parts = [args.destination]
            if args.username:
                path_parts.append(args.username)
            path_parts.append(image_data["sequence_id"])
            path_destination = os.path.join(*path_parts)
            os.makedirs(path_destination, exist_ok=True)

            # "captured_at" is divided by 1000 before conversion, i.e. it is
            # treated as milliseconds since the epoch.
            # NOTE(review): utcfromtimestamp() is deprecated since Python
            # 3.12; kept so the writer still receives a naive datetime.
            capture_time = datetime.utcfromtimestamp(
                int(image_data["captured_at"]) / 1000
            )
            date_time_image_filename = (
                capture_time.strftime("%Y-%m-%d_%HH%Mmn%Ss%f")[:-3] + ".jpg"
            )
            path = os.path.join(path_destination, date_time_image_filename)

            if not args.overwrite and os.path.exists(path):
                print("{} already exists. Skipping ".format(path))
                continue

            is_360 = image_data.get("camera_type") in ("spherical", "equirectangular")
            img_metadata = writer.PictureMetadata(
                capture_time=capture_time,
                artist=image_data["creator"]["username"],
                camera_make=image_data["make"],
                camera_model=image_data["model"],
                longitude=image_data["geometry"]["coordinates"][0],
                latitude=image_data["geometry"]["coordinates"][1],
                picture_type=PictureType("equirectangular" if is_360 else "flat"),
                direction=image_data["compass_angle"],
                altitude=image_data["altitude"],
            )

            future = executor.submit(
                download,
                url=image_data["thumb_original_url"],
                filepath=path,
                metadata=img_metadata,
            )
            pending_downloads[future] = path

        # Exceptions raised in a worker are stored on its future; report them
        # instead of letting them disappear silently.
        for future in concurrent.futures.as_completed(pending_downloads):
            error = future.exception()
            if error is not None:
                print(
                    "{} FAILED to download. Error: {}".format(
                        pending_downloads[future], error
                    )
                )