I have the following code for uploading to S3 using multipart upload.
import logging
import boto3


class UploadS3:
    def __init__(self, bucket, prefix):
        self.s3 = boto3.resource('s3')
        self.bucket = bucket
        self.prefix = prefix

    def start(self, key):
        '''Start to upload a new file'''
        self.part_no = 1
        self.parts = []
        key_path = f'{self.prefix}/{key}'
        self.s3obj = self.s3.Object(self.bucket, key_path)
        self.mpu = self.s3obj.initiate_multipart_upload()
        self.buffer = bytearray()

    def upload(self, chunk):
        '''Upload a chunk'''
        if len(self.buffer) >= 5_000_000:
            self._upload_buffer()
        self.buffer += chunk

    def end(self, part_info={}):
        '''Flush any remaining data and complete the multipart upload'''
        if len(self.buffer):
            self._upload_buffer()
        part_info['Parts'] = self.parts
        mpu_result = self.mpu.complete(MultipartUpload=part_info)
        logging.info(f'Upload result: {mpu_result}')

    def _upload_buffer(self):
        '''Upload the current buffer as the next part'''
        self.part = self.mpu.Part(self.part_no)
        print(f'buffer len: {len(self.buffer)}')
        resp = self.part.upload(Body=self.buffer)
        print({'PartNumber': self.part_no, 'ETag': resp['ETag']})
        self.parts.append({'PartNumber': self.part_no, 'ETag': resp['ETag']})
        self.part_no += 1
        self.buffer = bytearray()
And I created the following test code:
upload_s3 = UploadS3(BUCKET, PREFIX)
key = 'key2'
upload_s3.start(key)
upload_s3.upload(b'0' * 1_000_000)
upload_s3.upload(b'1' * 1_000_000)
upload_s3.upload(b'2' * 1_000_000)
upload_s3.upload(b'3' * 1_000_000)
upload_s3.upload(b'4' * 999_999)
upload_s3.upload(b'abcde')
upload_s3.upload(b'12345')
upload_s3.end({})
However, I got the following error. The length of the first part is 5,000,004 bytes and the second (last) part is only 5 bytes, but the last part shouldn't need to be over 5 MB, should it?
buffer len: 5000004
{'PartNumber': 1, 'ETag': '"e616f253def9510e3be2af0854e4c992"'}
buffer len: 5
{'PartNumber': 2, 'ETag': '"db44331bface5c8678770426baf73bc2"'}
Traceback (most recent call last):
File "test1.py", line 35, in <module>
main()
File "test1.py", line 31, in main
upload_s3.end({})
File "/home/x/upload_s3.py", line 31, in end
mpu_result = self.mpu.complete(MultipartUpload=part_info)
File "/apps/external/4/anaconda3/lib/python3.6/site-packages/boto3/resources/factory.py", line 520, in do_action
response = action(self, *args, **kwargs)
File "/apps/external/4/anaconda3/lib/python3.6/site-packages/boto3/resources/action.py", line 83, in __call__
response = getattr(parent.meta.client, operation_name)(*args, **params)
File "/apps/external/4/anaconda3/lib/python3.6/site-packages/botocore/client.py", line 386, in _api_call
return self._make_api_call(operation_name, kwargs)
File "/apps/external/4/anaconda3/lib/python3.6/site-packages/botocore/client.py", line 705, in _make_api_call
raise error_class(parsed_response, operation_name)
botocore.exceptions.ClientError: An error occurred (EntityTooSmall) when calling the CompleteMultipartUpload operation: Your proposed upload is smaller than the minimum allowed size
CodePudding user response:
As of writing this answer, the S3 multipart upload limitations page has the following table:
| Item | Specification |
| --- | --- |
| Maximum object size | 5 TB |
| Maximum number of parts per upload | 10,000 |
| Part numbers | 1 to 10,000 (inclusive) |
| Part size | 5 MB to 5 GB. There is no minimum size limit on the last part of your multipart upload. |
| Maximum number of parts returned for a list parts request | 1000 |
| Maximum number of multipart uploads returned in a list multipart uploads request | 1000 |
However, there is a subtle mistake. It says 5 MB instead of 5 MiB (and possibly 5 GB should actually be 5 GiB).
Since you split the parts every `5_000_000` bytes (which is 5 MB, but "only" ~4.77 MiB), your first part of 5,000,004 bytes ends up below the real 5 MiB minimum. The tiny 5-byte last part is fine on its own, because the final part is exempt from the minimum, but the undersized first part is what makes S3 reject the CompleteMultipartUpload call with EntityTooSmall.
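A quick check of the numbers (the variable names below are just illustrative):

min_part_size = 5 * 1024 ** 2            # 5_242_880 bytes (5 MiB), the real minimum
first_part_len = 5_000_004               # the "buffer len" from your output
print(first_part_len >= min_part_size)   # prints False -> part 1 is too small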
You should instead split the parts every `5_242_880` (`5 * 1024 ** 2`) bytes.
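In your `upload` method that is a one-line change (a minimal sketch; the rest of the class stays as it is):

def upload(self, chunk):
    '''Upload a chunk'''
    # Flush the buffer as a part only once it holds at least 5 MiB
    # (5 * 1024 ** 2 == 5_242_880 bytes), so every part except the
    # final one meets the minimum size.
    if len(self.buffer) >= 5 * 1024 ** 2:
        self._upload_buffer()
    self.buffer += chunk

With that threshold your test run never reaches the limit before `end()` (it writes about 5,000,009 bytes in total), so everything goes up as a single final part, and the final part is the one with no minimum size.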
I submitted a pull request on the S3 docs page.