(Docker flask)gunicorn server failing when workers are set to 4-CodePudding

I am trying to deploy my Flask app on a Vertex AI endpoint. I have built my Docker container, but it crashes when I run it locally, so I am not able to connect to the Flask app. I have previously deployed an app to a Vertex AI endpoint, and I have used pretty much the same Dockerfile as the one listed below.

Dockerfile

# Base image providing Python 3.8.8 and pip.
FROM python:3.8.8

# Relative paths in the instructions below resolve against /app in the image.
WORKDIR /app

# Copy the application sources and the dependency list into the image.
COPY ./app /app
COPY requirements.txt requirements.txt

# Install the Python dependencies listed in requirements.txt.
RUN pip install -r requirements.txt

# NOTE(review): port 8080 is declared here, but gunicorn below binds to 5005
# (the __main__ block of app.py uses 8080) - confirm which port is intended.
EXPOSE 8080
# Serve the `app` object from app.py with 4 gunicorn workers and a 150 s
# worker timeout. Each worker imports app.py when it boots, so the
# module-level code in app.py runs once per worker.
CMD gunicorn --bind=0.0.0.0:5005 --timeout=150 "app:app" -w 4

This is my Requirements.txt

pandas
flask
numpy
requests
Pillow
torch
opencv-python-headless
wget
Flask-Cors
torchvision
gunicorn
Ipython
psutil
PyYAML
tqdm
matplotlib
seaborn
gitpython
scipy

This is my app.py file

import pandas as pd
from flask import Flask,jsonify,request
import numpy as np
import os, io, requests
from PIL import Image
import torch, time, cv2, wget
import json
from flask_cors import CORS, cross_origin

# WSGI application object; gunicorn serves it through the "app:app" target.
app = Flask(__name__)

# The model is loaded at import time, so every process that imports this
# module (for example each gunicorn worker) executes the code below and
# touches the same torch.hub cache directory.
model_dir = './model/'
# NOTE(review): os.listdir() returns entries in arbitrary order and only the
# first entry is inspected, so a .pt file that is not listed first is ignored.
dir_list = os.listdir(model_dir)
if dir_list and dir_list[0].split(".")[-1] == "pt":
    # A weights file was supplied in ./model/: load it as a custom YOLOv5 model.
    weights = f'model/{dir_list[0]}'
    print(f"supplied weights found, loading weights - {weights}")
    # force_reload=True asks torch.hub to refresh its cached copy of the
    # repository instead of reusing it.
    model = torch.hub.load("ultralytics/yolov5", "custom", path=weights, 
force_reload=True)
else:
    # No usable weights file: fall back to the stock yolov5l model from the hub.
    print(f"loading default yolov5l weights")
    model = torch.hub.load("ultralytics/yolov5", "yolov5l", force_reload=True)
# Switch the model to evaluation (inference) mode.
model.eval()

def crop_image_into_four_parts(image):
    """Split an image array into its four quadrants.

    ``image`` must expose a three-element ``shape`` (height, width, channels)
    and support two-axis slicing, e.g. a NumPy array. Each axis is cut at
    ``int(size / 2)``, so for odd sizes the bottom/right quadrants receive the
    extra row/column.

    Returns a list ordered top-left, top-right, bottom-left, bottom-right.
    """
    rows, cols, _ = image.shape
    mid_row = int(rows / 2)
    mid_col = int(cols / 2)
    upper, lower = slice(None, mid_row), slice(mid_row, None)
    left, right = slice(None, mid_col), slice(mid_col, None)
    return [image[band, side] for band in (upper, lower) for side in (left, right)]

def update_detection_points(grid_detection, width, height):
    """Translate per-quadrant detections back into full-image coordinates.

    Args:
        grid_detection: mapping of quadrant index to a list of detection
            dicts, each holding a ``'coordinates'`` list
            ``[xmin, ymin, xmax, ymax]`` relative to that quadrant. Indices
            follow ``crop_image_into_four_parts``: 0 top-left, 1 top-right,
            2 bottom-left, 3 bottom-right.
        width: width of the full image in pixels.
        height: height of the full image in pixels.

    Returns:
        A flat list of all detection dicts, in iteration order of
        ``grid_detection``. The dicts are the same objects that were passed
        in; their coordinates are shifted in place.
    """
    x_shift = int(width / 2)
    y_shift = int(height / 2)
    # (dx, dy) of each quadrant's origin inside the full image. Quadrant 0
    # (and any unknown index) already shares the full image's origin.
    offsets = {1: (x_shift, 0), 2: (0, y_shift), 3: (x_shift, y_shift)}

    final_result = []
    for window, detections in grid_detection.items():
        dx, dy = offsets.get(window, (0, 0))
        for result in detections:
            coords = result['coordinates']
            # Shift the box by the quadrant offset; assigning the offset
            # instead would collapse the box to a single point.
            coords[0] += dx
            coords[1] += dy
            coords[2] += dx
            coords[3] += dy
            final_result.append(result)
    return final_result

def _error_response(message):
    """Build the JSON-serialisable error envelope returned by ``predict``."""
    return {'status': 400, 'result': [], 'error': message}


def _load_image(source):
    """Open ``source`` (an http(s) URL or a local file path) as a PIL image."""
    # NOTE(review): ``source`` comes straight from the request body, so a
    # caller can make the server fetch arbitrary URLs or read arbitrary local
    # files. Restrict this before exposing the endpoint to untrusted clients.
    if source.startswith(('http://', 'https://')):
        response = requests.get(source, timeout=30)
        response.raise_for_status()
        data = response.content
    else:
        with open(source, 'rb') as image_file:
            data = image_file.read()
    return Image.open(io.BytesIO(data))


def _to_detections(results):
    """Convert a model result into a list of label/score/coordinates dicts."""
    tags = json.loads(results.pandas().xyxy[0].to_json(orient="records"))
    return [
        {
            'label': tag['name'],
            'score': round(float(tag['confidence']), 2),
            'coordinates': [int(tag['xmin']), int(tag['ymin']),
                            int(tag['xmax']), int(tag['ymax'])],
        }
        for tag in tags
    ]


@app.route('/predict',methods=['POST','GET'])
# @cross_origin(supports_credentials=True)
def predict():
    """Run object detection on the image named in the request.

    Expects a JSON body ``{"instances": [mode, source]}`` where ``mode`` is
    ``'detect'`` (whole image) or ``'detect_by_part'`` (image split into four
    quadrants that are detected separately) and ``source`` is an http(s) URL
    or a local file path.

    Returns a JSON object ``{'result': [...], 'status': 200}`` on success, or
    ``{'result': [], 'status': 400, 'error': ...}`` on failure. The outcome is
    carried in the body's ``status`` field; no HTTP status code is set
    explicitly.
    """
    # silent=True yields None instead of raising when the body is missing or
    # is not JSON (e.g. a plain GET request).
    payload = request.get_json(silent=True)
    req = payload.get('instances') if isinstance(payload, dict) else None
    if not isinstance(req, (list, tuple)) or len(req) < 2 or not isinstance(req[1], str):
        return jsonify(_error_response("'instances' must be a list: [mode, image_source]"))

    mode, source = req[0], req[1]

    if mode == 'detect':
        try:
            img = _load_image(source)

            t1 = time.time()
            results = model(img)
            print(f"*time taken for detection - {round(time.time()-t1, 3)} sec*")
            lis = _to_detections(results)
            print(lis)
            torch.cuda.empty_cache()
            return jsonify({'result': lis, 'status': 200})

        except Exception as e:
            print(e)
            # An exception object is not JSON serialisable; send its message.
            return jsonify(_error_response(str(e)))

    if mode == 'detect_by_part':
        try:
            # A PIL image has no ``.shape`` and cannot be sliced per axis, so
            # convert it to a height x width x 3 array before cropping.
            image = np.array(_load_image(source).convert('RGB'))
            height, width, _ = image.shape
            crop_images = crop_image_into_four_parts(image)
            grid_detections = {}
            for inx, img in enumerate(crop_images):
                grid_detections[inx] = _to_detections(model(img))
            lis = update_detection_points(grid_detections, width, height)
            print("result:::", lis)
            torch.cuda.empty_cache()
            return jsonify({'result': lis, 'status': 200})

        except Exception as e:
            print(e)
            return jsonify(_error_response('error'))

    # Unknown mode: answer explicitly instead of falling through and
    # returning None from the view.
    return jsonify(_error_response(f"unsupported mode: {mode!r}"))

@app.route('/healthz')
def healthz():
    """Health-check endpoint: always responds with the plain-text body ``OK``."""
    return "OK"

# Entry point for running the file directly with Flask's built-in server;
# gunicorn imports the module instead and never executes this branch.
# NOTE(review): debug=True together with host 0.0.0.0 exposes Flask's debug
# mode on all network interfaces - keep this for local development only.
if __name__=='__main__':
    app.run(host='0.0.0.0', debug=True, port=8080)

Here is the error traceback

[2023-01-24 06:47:26 +0000] [7] [INFO] Starting gunicorn 20.1.0
[2023-01-24 06:47:26 +0000] [7] [INFO] Listening at: http://0.0.0.0:5005 (7)
[2023-01-24 06:47:26 +0000] [7] [INFO] Using worker: sync
[2023-01-24 06:47:26 +0000] [9] [INFO] Booting worker with pid: 9
[2023-01-24 06:47:27 +0000] [10] [INFO] Booting worker with pid: 10
[2023-01-24 06:47:27 +0000] [11] [INFO] Booting worker with pid: 11
[2023-01-24 06:47:27 +0000] [12] [INFO] Booting worker with pid: 12
[2023-01-24 06:47:41 +0000] [11] [ERROR] Exception in worker process
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/site-packages/gunicorn/arbiter.py", line 589, in spawn_worker
    worker.init_process()
  File "/usr/local/lib/python3.8/site-packages/gunicorn/workers/base.py", line 134, in init_process
    self.load_wsgi()
  File "/usr/local/lib/python3.8/site-packages/gunicorn/workers/base.py", line 146, in load_wsgi
    self.wsgi = self.app.wsgi()
  File "/usr/local/lib/python3.8/site-packages/gunicorn/app/base.py", line 67, in wsgi
    self.callable = self.load()
  File "/usr/local/lib/python3.8/site-packages/gunicorn/app/wsgiapp.py", line 58, in load
    return self.load_wsgiapp()
  File "/usr/local/lib/python3.8/site-packages/gunicorn/app/wsgiapp.py", line 48, in load_wsgiapp
    return util.import_app(self.app_uri)
  File "/usr/local/lib/python3.8/site-packages/gunicorn/util.py", line 359, in import_app
    mod = importlib.import_module(module)
  File "/usr/local/lib/python3.8/importlib/__init__.py", line 127, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1014, in _gcd_import
  File "<frozen importlib._bootstrap>", line 991, in _find_and_load
  File "<frozen importlib._bootstrap>", line 975, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 671, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 783, in exec_module
  File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
  File "/app/app.py", line 20, in <module>
    model = torch.hub.load("ultralytics/yolov5", "yolov5l", force_reload=True)
  File "/usr/local/lib/python3.8/site-packages/torch/hub.py", line 539, in load
    repo_or_dir = _get_cache_or_reload(repo_or_dir, force_reload, trust_repo, "load",
  File "/usr/local/lib/python3.8/site-packages/torch/hub.py", line 178, in _get_cache_or_reload
    os.makedirs(hub_dir)
  File "/usr/local/lib/python3.8/os.py", line 223, in makedirs
    mkdir(name, mode)
FileExistsError: [Errno 17] File exists: '/root/.cache/torch/hub'
[2023-01-24 06:47:41 +0000] [12] [ERROR] Exception in worker process
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/site-packages/gunicorn/arbiter.py", line 589, in spawn_worker
    worker.init_process()
  File "/usr/local/lib/python3.8/site-packages/gunicorn/workers/base.py", line 134, in init_process
    self.load_wsgi()
  File "/usr/local/lib/python3.8/site-packages/gunicorn/workers/base.py", line 146, in load_wsgi
    self.wsgi = self.app.wsgi()
  File "/usr/local/lib/python3.8/site-packages/gunicorn/app/base.py", line 67, in wsgi
    self.callable = self.load()
  File "/usr/local/lib/python3.8/site-packages/gunicorn/app/wsgiapp.py", line 58, in load
    return self.load_wsgiapp()
  File "/usr/local/lib/python3.8/site-packages/gunicorn/app/wsgiapp.py", line 48, in load_wsgiapp
    return util.import_app(self.app_uri)
  File "/usr/local/lib/python3.8/site-packages/gunicorn/util.py", line 359, in import_app
    mod = importlib.import_module(module)
  File "/usr/local/lib/python3.8/importlib/__init__.py", line 127, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1014, in _gcd_import
  File "<frozen importlib._bootstrap>", line 991, in _find_and_load
  File "<frozen importlib._bootstrap>", line 975, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 671, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 783, in exec_module
  File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
  File "/app/app.py", line 20, in <module>
    model = torch.hub.load("ultralytics/yolov5", "yolov5l", force_reload=True)
  File "/usr/local/lib/python3.8/site-packages/torch/hub.py", line 539, in load
    repo_or_dir = _get_cache_or_reload(repo_or_dir, force_reload, trust_repo, "load",
  File "/usr/local/lib/python3.8/site-packages/torch/hub.py", line 178, in _get_cache_or_reload
    os.makedirs(hub_dir)
  File "/usr/local/lib/python3.8/os.py", line 223, in makedirs
    mkdir(name, mode)
FileExistsError: [Errno 17] File exists: '/root/.cache/torch/hub'
[2023-01-24 06:47:41 +0000] [9] [ERROR] Exception in worker process
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/site-packages/gunicorn/arbiter.py", line 589, in spawn_worker
    worker.init_process()
  File "/usr/local/lib/python3.8/site-packages/gunicorn/workers/base.py", line 134, in init_process
    self.load_wsgi()
  File "/usr/local/lib/python3.8/site-packages/gunicorn/workers/base.py", line 146, in load_wsgi
    self.wsgi = self.app.wsgi()
  File "/usr/local/lib/python3.8/site-packages/gunicorn/app/base.py", line 67, in wsgi
    self.callable = self.load()
  File "/usr/local/lib/python3.8/site-packages/gunicorn/app/wsgiapp.py", line 58, in load
    return self.load_wsgiapp()
  File "/usr/local/lib/python3.8/site-packages/gunicorn/app/wsgiapp.py", line 48, in load_wsgiapp
    return util.import_app(self.app_uri)
  File "/usr/local/lib/python3.8/site-packages/gunicorn/util.py", line 359, in import_app
    mod = importlib.import_module(module)
  File "/usr/local/lib/python3.8/importlib/__init__.py", line 127, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1014, in _gcd_import
  File "<frozen importlib._bootstrap>", line 991, in _find_and_load
  File "<frozen importlib._bootstrap>", line 975, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 671, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 783, in exec_module
  File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
  File "/app/app.py", line 20, in <module>
    model = torch.hub.load("ultralytics/yolov5", "yolov5l", force_reload=True)
  File "/usr/local/lib/python3.8/site-packages/torch/hub.py", line 539, in load
    repo_or_dir = _get_cache_or_reload(repo_or_dir, force_reload, trust_repo, "load",
  File "/usr/local/lib/python3.8/site-packages/torch/hub.py", line 178, in _get_cache_or_reload
    os.makedirs(hub_dir)
  File "/usr/local/lib/python3.8/os.py", line 223, in makedirs
    mkdir(name, mode)
FileExistsError: [Errno 17] File exists: '/root/.cache/torch/hub'
[2023-01-24 06:47:41 +0000] [11] [INFO] Worker exiting (pid: 11)
[2023-01-24 06:47:41 +0000] [12] [INFO] Worker exiting (pid: 12)
[2023-01-24 06:47:41 +0000] [9] [INFO] Worker exiting (pid: 9)
loading default yolov5l weights
loading default yolov5l weights
loading default yolov5l weights
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/site-packages/gunicorn/arbiter.py", line 209, in run
    self.sleep()
  File "/usr/local/lib/python3.8/site-packages/gunicorn/arbiter.py", line 357, in sleep
    ready = select.select([self.PIPE[0]], [], [], 1.0)
  File "/usr/local/lib/python3.8/site-packages/gunicorn/arbiter.py", line 242, in handle_chld
    self.reap_workers()
  File "/usr/local/lib/python3.8/site-packages/gunicorn/arbiter.py", line 525, in reap_workers
    raise HaltServer(reason, self.WORKER_BOOT_ERROR)
gunicorn.errors.HaltServer: <HaltServer 'Worker failed to boot.' 3>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/bin/gunicorn", line 8, in <module>
    sys.exit(run())
  File "/usr/local/lib/python3.8/site-packages/gunicorn/app/wsgiapp.py", line 67, in run
    WSGIApplication("%(prog)s [OPTIONS] [APP_MODULE]").run()
  File "/usr/local/lib/python3.8/site-packages/gunicorn/app/base.py", line 231, in run
    super().run()
  File "/usr/local/lib/python3.8/site-packages/gunicorn/app/base.py", line 72, in run
    Arbiter(self).run()
  File "/usr/local/lib/python3.8/site-packages/gunicorn/arbiter.py", line 229, in run
    self.halt(reason=inst.reason, exit_status=inst.exit_status)
  File "/usr/local/lib/python3.8/site-packages/gunicorn/arbiter.py", line 342, in halt
    self.stop()
  File "/usr/local/lib/python3.8/site-packages/gunicorn/arbiter.py", line 393, in stop
    time.sleep(0.1)
  File "/usr/local/lib/python3.8/site-packages/gunicorn/arbiter.py", line 242, in handle_chld
    self.reap_workers()
  File "/usr/local/lib/python3.8/site-packages/gunicorn/arbiter.py", line 525, in reap_workers
    raise HaltServer(reason, self.WORKER_BOOT_ERROR)
gunicorn.errors.HaltServer: <HaltServer 'Worker failed to boot.' 3>

There is no issue when I run app.py locally as a Flask app, but it fails when I run it with the gunicorn command, specifically when I set the number of workers to 4. Maybe it has something to do with yolov5, as I have seen many OS errors saying either "directory not empty" or "file exists". Hopefully I have included all the files needed to replicate it locally. Thank you in advance.

CodePudding user response：

The error does not come from gunicorn's workers themselves; it comes from your code,

these lines

model_dir = './model/'
dir_list = os.listdir(model_dir)
if dir_list and dir_list[0].split(".")[-1] == "pt":
    weights = f'model/{dir_list[0]}'
    print(f"supplied weights found, loading weights - {weights}")
    model = torch.hub.load("ultralytics/yolov5", "custom", path=weights, 
force_reload=True)
else:
    print(f"loading default yolov5l weights")
    model = torch.hub.load("ultralytics/yolov5", "yolov5l", force_reload=True)
model.eval()

Passing the workers argument to gunicorn means gunicorn will import your app.py 4 times (once for each worker). Because the code above is at module level, it is executed by all workers at the same time while gunicorn is spawning them.

The error source comes from torch.hub.load(). It's creating the same folder /root/.cache/torch/hub.

 File "/app/app.py", line 20, in <module>
    model = torch.hub.load("ultralytics/yolov5", "yolov5l", force_reload=True)
  File "/usr/local/lib/python3.8/site-packages/torch/hub.py", line 539, in load
    repo_or_dir = _get_cache_or_reload(repo_or_dir, force_reload, trust_repo, "load",
  File "/usr/local/lib/python3.8/site-packages/torch/hub.py", line 178, in _get_cache_or_reload
    os.makedirs(hub_dir)
  File "/usr/local/lib/python3.8/os.py", line 223, in makedirs
    mkdir(name, mode)
FileExistsError: [Errno 17] File exists: '/root/.cache/torch/hub'

That is why it does not fail with a single worker: the code is executed only once.

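To solve this, you need to handle the PyTorch hub cache when loading the model; otherwise you can only run a single worker per load. For example, populate the cache once before the workers start (such as by downloading the model while building the image) and do not pass force_reload=True, or start gunicorn with --preload so that the module is imported only once, before the workers are forked.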