I am trying to deploy my flask app on a vertex ai endpoint. I have built my docker container but I am not able to connect to the flask app when I run the docker container locally as it crashes. I have previously deployed an app to a vertex ai endpoint and I have pretty much used the same Dockerfile as I have listed below.
Dockerfile
FROM python:3.8.8
WORKDIR /app
COPY ./app /app
COPY requirements.txt requirements.txt
RUN pip install -r requirements.txt
EXPOSE 8080
CMD gunicorn --bind=0.0.0.0:5005 --timeout=150 "app:app" -w 4
This is my Requirements.txt
pandas
flask
numpy
requests
Pillow
torch
opencv-python-headless
wget
Flask-Cors
torchvision
gunicorn
Ipython
psutil
PyYAML
tqdm
matplotlib
seaborn
gitpython
scipy
This is my app.py file
import pandas as pd
from flask import Flask,jsonify,request
import numpy as np
import os, io, requests
from PIL import Image
import torch, time, cv2, wget
import json
from flask_cors import CORS, cross_origin
app = Flask(__name__)
model_dir = './model/'
dir_list = os.listdir(model_dir)
if dir_list and dir_list[0].split(".")[-1] == "pt":
weights = f'model/{dir_list[0]}'
print(f"supplied weights found, loading weights - {weights}")
model = torch.hub.load("ultralytics/yolov5", "custom", path=weights,
force_reload=True)
else:
print(f"loading default yolov5l weights")
model = torch.hub.load("ultralytics/yolov5", "yolov5l", force_reload=True)
model.eval()
def crop_image_into_four_parts(image):
height, width, _ = image.shape
first_part = image[:int(height / 2), :int(width / 2)]
second_part = image[:int(height / 2), int(width / 2):]
third_part = image[int(height / 2):, :int(width / 2)]
four_part = image[int(height / 2):, int(width / 2):]
return [first_part, second_part, third_part, four_part]
def update_detection_points(grid_detection, width, height):
final_result = []
for window in grid_detection:
for result in grid_detection[window]:
if window == 1:
result['coordinates'][0] = int(width / 2)
result['coordinates'][2] = int(width / 2)
elif window == 2:
result['coordinates'][1] = int(height / 2)
result['coordinates'][3] = int(height / 2)
elif window == 3:
result['coordinates'][0] = int(width / 2)
result['coordinates'][1] = int(height / 2)
result['coordinates'][2] = int(width / 2)
result['coordinates'][3] = int(height / 2)
final_result.append(result)
return final_result
@app.route('/predict',methods=['POST','GET'])
# @cross_origin(supports_credentials=True)
def predict():
req = request.json.get('instances')
if req[0] == 'detect':
try:
if 'http' in req[1]:
img = Image.open(io.BytesIO(requests.get(req[1], stream=True).content))
else:
img = Image.open(io.BytesIO(open(req[1], 'rb').read()))
t1 = time.time()
results = model(img)
print(f"*time taken for detection - {round(time.time()-t1, 3)} sec*")
tags = json.loads(results.pandas().xyxy[0].to_json(orient="records"))
lis = []
if len(tags):
for i in tags:
lis.append({'label': i['name'], 'score': round(i['confidence'], 2), 'coordinates':[int(i['xmin']),int(i['ymin']),int(i['xmax']),int(i['ymax'])]})
#os.remove(source)
print(lis)
torch.cuda.empty_cache()
return jsonify({'result': lis, 'status': 200})
except Exception as e:
print(e)
res = dict()
res['status'] = 400
res['result'] = []
res['error'] = e
return jsonify(res)
elif req[0] == 'detect_by_part':
try:
if 'http' in req[1]:
image_file = wget.download(req[1])
image = Image.open(io.BytesIO(open(image_file, 'rb').read()))
height, width, _ = image.shape
crop_images = crop_image_into_four_parts(image)
grid_detections = {}
for inx, img in enumerate(crop_images):
results = model(img)
lis = []
tags = json.loads(results.pandas().xyxy[0].to_json(orient="records"))
if tags:
for i in tags:
lis.append({'label': i['name'], 'score': round(float(i['confidence']), 2),
'coordinates': [int(i['xmin']), int(i['ymin']), int(i['xmax']), int(i['ymax'])]})
grid_detections[inx] = lis
lis = update_detection_points(grid_detections, width, height)
print("result:::", lis)
torch.cuda.empty_cache()
return jsonify({'result': lis, 'status': 200})
except Exception as e:
print(e)
res = dict()
res['status'] = 400
res['result'] = []
res['error'] = 'error'
return jsonify(res)
@app.route('/healthz')
def healthz():
return "OK"
if __name__=='__main__':
app.run(host='0.0.0.0', debug=True, port=8080)
Here is the error traceback
[2023-01-24 06:47:26 0000] [7] [INFO] Starting gunicorn 20.1.0
[2023-01-24 06:47:26 0000] [7] [INFO] Listening at: http://0.0.0.0:5005 (7)
[2023-01-24 06:47:26 0000] [7] [INFO] Using worker: sync
[2023-01-24 06:47:26 0000] [9] [INFO] Booting worker with pid: 9
[2023-01-24 06:47:27 0000] [10] [INFO] Booting worker with pid: 10
[2023-01-24 06:47:27 0000] [11] [INFO] Booting worker with pid: 11
[2023-01-24 06:47:27 0000] [12] [INFO] Booting worker with pid: 12
[2023-01-24 06:47:41 0000] [11] [ERROR] Exception in worker process
Traceback (most recent call last):
File "/usr/local/lib/python3.8/site-packages/gunicorn/arbiter.py", line 589, in spawn_worker
worker.init_process()
File "/usr/local/lib/python3.8/site-packages/gunicorn/workers/base.py", line 134, in init_process
self.load_wsgi()
File "/usr/local/lib/python3.8/site-packages/gunicorn/workers/base.py", line 146, in load_wsgi
self.wsgi = self.app.wsgi()
File "/usr/local/lib/python3.8/site-packages/gunicorn/app/base.py", line 67, in wsgi
self.callable = self.load()
File "/usr/local/lib/python3.8/site-packages/gunicorn/app/wsgiapp.py", line 58, in load
return self.load_wsgiapp()
File "/usr/local/lib/python3.8/site-packages/gunicorn/app/wsgiapp.py", line 48, in load_wsgiapp
return util.import_app(self.app_uri)
File "/usr/local/lib/python3.8/site-packages/gunicorn/util.py", line 359, in import_app
mod = importlib.import_module(module)
File "/usr/local/lib/python3.8/importlib/__init__.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 1014, in _gcd_import
File "<frozen importlib._bootstrap>", line 991, in _find_and_load
File "<frozen importlib._bootstrap>", line 975, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 671, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 783, in exec_module
File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
File "/app/app.py", line 20, in <module>
model = torch.hub.load("ultralytics/yolov5", "yolov5l", force_reload=True)
File "/usr/local/lib/python3.8/site-packages/torch/hub.py", line 539, in load
repo_or_dir = _get_cache_or_reload(repo_or_dir, force_reload, trust_repo, "load",
File "/usr/local/lib/python3.8/site-packages/torch/hub.py", line 178, in _get_cache_or_reload
os.makedirs(hub_dir)
File "/usr/local/lib/python3.8/os.py", line 223, in makedirs
mkdir(name, mode)
FileExistsError: [Errno 17] File exists: '/root/.cache/torch/hub'
[2023-01-24 06:47:41 0000] [12] [ERROR] Exception in worker process
Traceback (most recent call last):
File "/usr/local/lib/python3.8/site-packages/gunicorn/arbiter.py", line 589, in spawn_worker
worker.init_process()
File "/usr/local/lib/python3.8/site-packages/gunicorn/workers/base.py", line 134, in init_process
self.load_wsgi()
File "/usr/local/lib/python3.8/site-packages/gunicorn/workers/base.py", line 146, in load_wsgi
self.wsgi = self.app.wsgi()
File "/usr/local/lib/python3.8/site-packages/gunicorn/app/base.py", line 67, in wsgi
self.callable = self.load()
File "/usr/local/lib/python3.8/site-packages/gunicorn/app/wsgiapp.py", line 58, in load
return self.load_wsgiapp()
File "/usr/local/lib/python3.8/site-packages/gunicorn/app/wsgiapp.py", line 48, in load_wsgiapp
return util.import_app(self.app_uri)
File "/usr/local/lib/python3.8/site-packages/gunicorn/util.py", line 359, in import_app
mod = importlib.import_module(module)
File "/usr/local/lib/python3.8/importlib/__init__.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 1014, in _gcd_import
File "<frozen importlib._bootstrap>", line 991, in _find_and_load
File "<frozen importlib._bootstrap>", line 975, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 671, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 783, in exec_module
File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
File "/app/app.py", line 20, in <module>
model = torch.hub.load("ultralytics/yolov5", "yolov5l", force_reload=True)
File "/usr/local/lib/python3.8/site-packages/torch/hub.py", line 539, in load
repo_or_dir = _get_cache_or_reload(repo_or_dir, force_reload, trust_repo, "load",
File "/usr/local/lib/python3.8/site-packages/torch/hub.py", line 178, in _get_cache_or_reload
os.makedirs(hub_dir)
File "/usr/local/lib/python3.8/os.py", line 223, in makedirs
mkdir(name, mode)
FileExistsError: [Errno 17] File exists: '/root/.cache/torch/hub'
[2023-01-24 06:47:41 0000] [9] [ERROR] Exception in worker process
Traceback (most recent call last):
File "/usr/local/lib/python3.8/site-packages/gunicorn/arbiter.py", line 589, in spawn_worker
worker.init_process()
File "/usr/local/lib/python3.8/site-packages/gunicorn/workers/base.py", line 134, in init_process
self.load_wsgi()
File "/usr/local/lib/python3.8/site-packages/gunicorn/workers/base.py", line 146, in load_wsgi
self.wsgi = self.app.wsgi()
File "/usr/local/lib/python3.8/site-packages/gunicorn/app/base.py", line 67, in wsgi
self.callable = self.load()
File "/usr/local/lib/python3.8/site-packages/gunicorn/app/wsgiapp.py", line 58, in load
return self.load_wsgiapp()
File "/usr/local/lib/python3.8/site-packages/gunicorn/app/wsgiapp.py", line 48, in load_wsgiapp
return util.import_app(self.app_uri)
File "/usr/local/lib/python3.8/site-packages/gunicorn/util.py", line 359, in import_app
mod = importlib.import_module(module)
File "/usr/local/lib/python3.8/importlib/__init__.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 1014, in _gcd_import
File "<frozen importlib._bootstrap>", line 991, in _find_and_load
File "<frozen importlib._bootstrap>", line 975, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 671, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 783, in exec_module
File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
File "/app/app.py", line 20, in <module>
model = torch.hub.load("ultralytics/yolov5", "yolov5l", force_reload=True)
File "/usr/local/lib/python3.8/site-packages/torch/hub.py", line 539, in load
repo_or_dir = _get_cache_or_reload(repo_or_dir, force_reload, trust_repo, "load",
File "/usr/local/lib/python3.8/site-packages/torch/hub.py", line 178, in _get_cache_or_reload
os.makedirs(hub_dir)
File "/usr/local/lib/python3.8/os.py", line 223, in makedirs
mkdir(name, mode)
FileExistsError: [Errno 17] File exists: '/root/.cache/torch/hub'
[2023-01-24 06:47:41 0000] [11] [INFO] Worker exiting (pid: 11)
[2023-01-24 06:47:41 0000] [12] [INFO] Worker exiting (pid: 12)
[2023-01-24 06:47:41 0000] [9] [INFO] Worker exiting (pid: 9)
loading default yolov5l weights
loading default yolov5l weights
loading default yolov5l weights
Traceback (most recent call last):
File "/usr/local/lib/python3.8/site-packages/gunicorn/arbiter.py", line 209, in run
self.sleep()
File "/usr/local/lib/python3.8/site-packages/gunicorn/arbiter.py", line 357, in sleep
ready = select.select([self.PIPE[0]], [], [], 1.0)
File "/usr/local/lib/python3.8/site-packages/gunicorn/arbiter.py", line 242, in handle_chld
self.reap_workers()
File "/usr/local/lib/python3.8/site-packages/gunicorn/arbiter.py", line 525, in reap_workers
raise HaltServer(reason, self.WORKER_BOOT_ERROR)
gunicorn.errors.HaltServer: <HaltServer 'Worker failed to boot.' 3>
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/bin/gunicorn", line 8, in <module>
sys.exit(run())
File "/usr/local/lib/python3.8/site-packages/gunicorn/app/wsgiapp.py", line 67, in run
WSGIApplication("%(prog)s [OPTIONS] [APP_MODULE]").run()
File "/usr/local/lib/python3.8/site-packages/gunicorn/app/base.py", line 231, in run
super().run()
File "/usr/local/lib/python3.8/site-packages/gunicorn/app/base.py", line 72, in run
Arbiter(self).run()
File "/usr/local/lib/python3.8/site-packages/gunicorn/arbiter.py", line 229, in run
self.halt(reason=inst.reason, exit_status=inst.exit_status)
File "/usr/local/lib/python3.8/site-packages/gunicorn/arbiter.py", line 342, in halt
self.stop()
File "/usr/local/lib/python3.8/site-packages/gunicorn/arbiter.py", line 393, in stop
time.sleep(0.1)
File "/usr/local/lib/python3.8/site-packages/gunicorn/arbiter.py", line 242, in handle_chld
self.reap_workers()
File "/usr/local/lib/python3.8/site-packages/gunicorn/arbiter.py", line 525, in reap_workers
raise HaltServer(reason, self.WORKER_BOOT_ERROR)
gunicorn.errors.HaltServer: <HaltServer 'Worker failed to boot.' 3>
There is no issue when I run the app.py file locally as flask app but when I run it using the gunicorn command it fails, specifically when I set the workers to 4. Maybe it has something to do with yolov5 as I have seen many os errors saying either directory not empty or file exists. Hopefully I have given all the files needed to replicate it locally if needed. Thank you in advance.
CodePudding user response:
So the error is not from the worker, the errors comes from your code
these lines
model_dir = './model/'
dir_list = os.listdir(model_dir)
if dir_list and dir_list[0].split(".")[-1] == "pt":
weights = f'model/{dir_list[0]}'
print(f"supplied weights found, loading weights - {weights}")
model = torch.hub.load("ultralytics/yolov5", "custom", path=weights,
force_reload=True)
else:
print(f"loading default yolov5l weights")
model = torch.hub.load("ultralytics/yolov5", "yolov5l", force_reload=True)
model.eval()
so adding worker arguments to the gunicorn means the gunicorn will consume your app.py
4 times (1 for each workers). As above code is on the main level, they will be executed at the same time while gunicorn workers are spawning.
The error source comes from torch.hub.load()
. It's creating the same folder /root/.cache/torch/hub
.
File "/app/app.py", line 20, in <module>
model = torch.hub.load("ultralytics/yolov5", "yolov5l", force_reload=True)
File "/usr/local/lib/python3.8/site-packages/torch/hub.py", line 539, in load
repo_or_dir = _get_cache_or_reload(repo_or_dir, force_reload, trust_repo, "load",
File "/usr/local/lib/python3.8/site-packages/torch/hub.py", line 178, in _get_cache_or_reload
os.makedirs(hub_dir)
File "/usr/local/lib/python3.8/os.py", line 223, in makedirs
mkdir(name, mode)
FileExistsError: [Errno 17] File exists: '/root/.cache/torch/hub'
thats why, if you run a single worker it wont error because it only executed once
How to solve this is you probably need to handle the pytorch cache when loading the models because if not you can only run a single worker per load.