I am trying to count all the files in Azure Blob Storage under the path: container_one/year/month/day/hour
What I tried:
// Attempt from the question: the whole virtual path is passed as the container name.
string path = "container_one/year/month/day/hour";
int totalItems = 0;
BlobContainerClient blobFolder = new BlobServiceClient(connectionString).GetBlobContainerClient(path);
foreach(var blob in blobFolder){
    totalItems++; // one increment per enumerated item
}
Console.WriteLine(totalItems);
Resulted in:
System.AggregateException: 'One or more errors occurred. (The requested URI does not represent any resource on the server.
What am I doing wrong, and how can I do it? I am sure that the provided path exists on the server.
CodePudding user response:
The working approach I found:
// Counts every blob whose name contains the given virtual-folder path.
int totalCount = 0;
string path = $"{container_one}/{year}/{month}/{day}/{hour}/";
// No arguments are passed to the listing call, so the path match below is done client-side.
var blobFolders = _blobContainerClient.GetBlobsAsync();
await foreach (var file in blobFolders)
{
    if (file.Name.Contains(path))
    {
        totalCount++;
    }
}
return totalCount;
Remarks
Doing it this way costs a lot of performance, but I don't see any other options. If anyone has found a solution, please share it with me!
CodePudding user response:
To expand on the OP's self-answer and respond to their request for how to improve performance, I'll share some code from a Linqpad file of mine that can iterate through blobs in Azure Blob Storage at a rate I last recorded as 6.7 million blobs in 6m38s (or about 1 million blobs per minute, or 16,666 blobs per second).
Getting blazing blob-iteration speeds like that is doable, but in my case was possible due to 2 things:
- I'm using the Content-Addressable-Storage practice, whereby each blob is immutable and the blob's name is the base-16 representation of its SHA-256 hash.
- So the statistical distribution of all blob names is spread evenly across the 16 base-16 character "bins": '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'.
- The other catch is that, because iterating blobs is somewhat chatty, network latency between your computer and Blob Storage will drastically slow things down regardless of your internet connection's throughput (so a 100 Mbps connection with 1 ms latency is far preferable to a 1 Gbps connection with 100 ms latency to your Azure storage account).
- The workaround is easy though: just spin-up a VM in Azure (you'll only need to rent a VM for an hour or so, so you can safely configure a beefy machine that might cost $1000/mo but will only cost you less than $5 because you only need it for a tiny fraction of a full calendar month).
As I mentioned, the fact that my blob-names are evenly (and randomly) distributed through a large number of "prefix bins" is central to why it's able to iterate over them so fast, however provided you can partition your blob-names into evenly-sized bins based on their prefix (including the container name and virtual-path) then it should run just as fast.
- Copy and paste the below code into a Linqpad script.
- You'll need to add a NuGet reference to Azure.Storage.Blobs (it was written against 12.8.4; you'll probably need to update and address any breaking changes).
- This script writes all blob names (and their Azure-generated MD5 hash) to a binary file for consumption by other processes.
/// <summary>"BlobsDump" folder on the current user's Desktop; the dump files are written here.</summary>
static readonly DirectoryInfo _outputDirectory = new DirectoryInfo(
    Path.Combine( Environment.GetFolderPath( Environment.SpecialFolder.Desktop ), "BlobsDump" )
);

/// <summary>Script entry point: ensures the output folder exists, then lists the container's blobs.</summary>
async Task Main()
{
    _outputDirectory.Create();

    // Timing notes from earlier runs:
    //  - 8,730,343 blobs indexed in 6m38s (compared to the few hours it took earlier).
    //  - UPDATE: with 2-character prefixes, enumerating 9m blobs takes 1m54s.
    await ListBlobsAsync( CreateBlobClient() );
}
/// <summary>Creates the client for the container named "container-name" from the hard-coded connection string.</summary>
/// <returns>A <see cref="BlobContainerClient"/> obtained from a new <see cref="BlobServiceClient"/>.</returns>
private static BlobContainerClient CreateBlobClient()
{
    // NOTE(review): the account name/key and endpoints look like placeholders - substitute real values before running.
    const String connectionString = @"DefaultEndpointsProtocol=https;AccountName=mindyourownbusiness;AccountKey=werenostrangerstoloveyouknowtherulesandsodoiafullcommitmentswhatimthinkingofyouwouldntgetthisfromanyotherguy;BlobEndpoint=https://.blob.core.windows.net/;TableEndpoint=https://.table.core.windows.net/;";

    BlobServiceClient serviceClient = new BlobServiceClient( connectionString );
    return serviceClient.GetBlobContainerClient( "container-name" );
}
/// <summary>The sixteen upper-case hexadecimal digits, in ascending order.</summary>
private static readonly Char[] _hexDigits = new[] { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };

/// <summary>Every two-character combination of <see cref="_hexDigits"/> ("00" through "FF", 256 entries); each one is listed as a separate blob-name prefix.</summary>
private static readonly IReadOnlyList<String> _blobNamePrefixes = _hexDigits.SelectMany( hd0 => _hexDigits.Select( hd1 => hd0.ToString() + hd1.ToString() ) ).ToList();
/// <summary>
/// Lists the container once per entry in <c>_blobNamePrefixes</c>, running all of those listings concurrently,
/// and dumps the combined blob count once every listing has completed.
/// </summary>
/// <param name="cc">Container whose blobs are listed.</param>
private static async Task ListBlobsAsync( BlobContainerClient cc )
{
    // One name-set per prefix, created up front so later code only reads the dictionary.
    ConcurrentDictionary<String,HashSet<String>> blobNamesPerPrefix = new ConcurrentDictionary<String,HashSet<String>>();
    foreach( String prefix in _blobNamePrefixes )
    {
        blobNamesPerPrefix[prefix] = new HashSet<String>();
    }

    using( CancellationTokenSource progressCts = new CancellationTokenSource() )
    {
        // The progress reporter runs alongside the listings. It is deliberately not awaited:
        // it is only signalled to stop (via the token) when the listings are done or have failed.
        _ = ReportProgressAsync( blobNamesPerPrefix, progressCts.Token );
        try
        {
            List<Task> tasks = _blobNamePrefixes.Select( prefix => ListBlobsAsync( cc: cc, prefix: prefix, dict: blobNamesPerPrefix ) ).ToList();
            await Task.WhenAll( tasks ).ConfigureAwait(false);
        }
        finally
        {
            progressCts.Cancel();
        }
    }

    Int32 total = blobNamesPerPrefix.Values.Sum( s => s.Count );
    ( $"Total blobs: {total:N0}." ).Dump();
}
/// <summary>Approximate number of blobs in the container; used only as the denominator of the progress fraction.</summary>
const Double _lastKnownBlobCountApprox = 8925524; // As of 2020-09-07.

/// <summary>
/// Shows a progress bar and refreshes it every 250 ms with the number of blob names collected so far.
/// </summary>
/// <param name="countsSoFar">Per-prefix name sets; their <c>Count</c> values are summed on every refresh.</param>
/// <param name="ct">Stops the refresh loop when cancelled. With the default token the loop never ends.</param>
/// <returns>A task that completes (without throwing) after <paramref name="ct"/> is cancelled.</returns>
private static async Task ReportProgressAsync( ConcurrentDictionary<String,HashSet<String>> countsSoFar, CancellationToken ct = default )
{
    var pb = new Util.ProgressBar( "Blobs indexed" );
    pb.Dump();
    while( true )
    {
        Int32 total = countsSoFar.Values.Sum( v => v.Count );
        pb.Fraction = (Double)total / _lastKnownBlobCountApprox;
        pb.Caption = ( $"{total:N0} blobs observed." );

        // Checked after the refresh so the bar shows the final count before the loop exits.
        if( ct.IsCancellationRequested ) break;

        try
        {
            await Task.Delay( 250, ct );
        }
        catch( OperationCanceledException )
        {
            // Cancelled mid-delay: loop once more for a final refresh, then exit above.
        }
    }
}
/// <summary>UTF-8 encoding that emits no byte-order mark and throws on invalid bytes; used for the dump files' strings.</summary>
private static readonly UTF8Encoding _utf8NoBom = new UTF8Encoding( encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true );

/// <summary>
/// Lists all blobs whose names start with <paramref name="prefix"/> and writes them to "&lt;prefix&gt;.dat" in the output directory.
/// </summary>
/// <param name="cc">Container to list.</param>
/// <param name="prefix">Blob-name prefix to list; also the key into <paramref name="dict"/> and the output file's base name.</param>
/// <param name="dict">Must already contain a set for <paramref name="prefix"/>; observed blob names are added to it.</param>
private static async Task ListBlobsAsync( BlobContainerClient cc, String prefix, ConcurrentDictionary<String,HashSet<String>> dict )
{
    HashSet<String> blobs = dict[prefix];
    const Int32 ONE_MEGABYTE = 1 * 1024 * 1024;
    String outputFileName = Path.Combine( _outputDirectory.FullName, prefix + ".dat" );

    // FileMode.CreateNew: opening fails if a dump file for this prefix already exists.
    using( FileStream fs = new FileStream( outputFileName, FileMode.CreateNew, FileAccess.Write, FileShare.None, bufferSize: ONE_MEGABYTE, options: FileOptions.SequentialScan | FileOptions.Asynchronous ) )
    using( MemoryStream ms = new MemoryStream( capacity: ONE_MEGABYTE ) )
    using( BinaryWriter wtr = new BinaryWriter( ms, _utf8NoBom ) )
    {
        // Records are written to the in-memory stream first; the inner method copies them to the file.
        Stopwatch sw = Stopwatch.StartNew();
        await ListBlobsInnerAsync( cc, prefix, blobs, wtr, fs, ms ).ConfigureAwait(false);
        ( $"Completed list with prefix \"{prefix}\". Blob count: {blobs.Count:N0}. Took {sw.ElapsedMilliseconds:N0}ms." ).Dump();
    }
}
/// <summary>
/// Pages through the blobs whose names start with <paramref name="prefix"/>, adds each new name to
/// <paramref name="blobs"/>, and appends one record per newly seen blob to <paramref name="fs"/>.
/// </summary>
/// <param name="cc">Container to list.</param>
/// <param name="prefix">Blob-name prefix passed to the listing call.</param>
/// <param name="blobs">Set of names seen so far; locked while a page is being added.</param>
/// <param name="wtr">Writer over <paramref name="ms"/>; records are serialized through it.</param>
/// <param name="fs">Destination file stream; receives the buffered records after every page.</param>
/// <param name="ms">Scratch buffer that is filled per page and then reset.</param>
private static async Task ListBlobsInnerAsync( BlobContainerClient cc, String prefix, HashSet<String> blobs, BinaryWriter wtr, FileStream fs, MemoryStream ms )
{
    Int32 i = 0;
    String? continuationToken = null;
    do
    {
        System.Collections.Generic.IAsyncEnumerable<Azure.Page<BlobItem>> segment = cc.GetBlobsAsync( prefix: prefix ).AsPages( continuationToken );
        await foreach( Azure.Page<BlobItem>? page in segment.ConfigureAwait(false) )
        {
            continuationToken = page.ContinuationToken;

            // A page can come back with no values (e.g. nothing matches the prefix);
            // skip it rather than index into an empty list.
            if( page.Values.Count == 0 ) continue;

            // Stop reading this segment once the last name's first character sorts after the prefix's.
            if( page.Values[page.Values.Count - 1].Name[0] > prefix[0] ) break;

            lock( blobs )
            {
                foreach( BlobItem bi in page.Values )
                {
                    // Only names not seen before are written out.
                    if( blobs.Add( bi.Name ) )
                    {
                        WriteBlobLine( ref i, bi, wtr );
                    }
                }
            }

            // Flush: push the writer's bytes into the buffer, copy the buffer to the file, then reset the buffer.
            wtr.Flush();
            ms.Flush();
            ms.Position = 0;
            await ms.CopyToAsync( fs ).ConfigureAwait(false);
            await fs.FlushAsync().ConfigureAwait(false);
            ms.Position = 0;
            ms.SetLength( 0 );
            wtr.Seek( 0, SeekOrigin.Begin );
        }
    }
    while( !String.IsNullOrWhiteSpace( continuationToken ) );
}
/// <summary>
/// Writes one record for <paramref name="bi"/> and increments <paramref name="i"/>.
/// Record layout: Int32 index, length-prefixed name, Int32 hash length (16 or 0), then the 16 hash bytes when present.
/// </summary>
/// <param name="i">Running record index; written first, then incremented by one.</param>
/// <param name="bi">Blob whose name and content hash are written.</param>
/// <param name="wtr">Destination writer.</param>
private static void WriteBlobLine( ref Int32 i, BlobItem bi, BinaryWriter wtr )
{
    wtr.Write( i );
    wtr.Write( bi.Name ); // Length-prefixed string.

    Byte[]? hash = bi.Properties.ContentHash;
    if( hash != null && hash.Length == 16 )
    {
        wtr.Write( hash.Length );
        wtr.Write( hash );
    }
    else
    {
        // A missing hash, or one that is not 16 bytes long, is recorded as zero-length.
        wtr.Write( 0 );
    }

    i++;
}
CodePudding user response:
The method .GetBlobContainerClient() has many overloads; the one you are trying to call requires a Uri, so what you should be doing is .GetBlobContainerClient(new Uri(path)).