In an attempt to profile & optimize a caching algorithm, I got stuck at something I don't understand.
Here is the hot-spot of the perf's report (in Ubuntu 18.04 LTS and g 7.5):
How does just a "mov" operatiaon between rax and rdx registers cause ~30% of total run-time of program? It's not a register-heavy program (an LRU-cache approximation algorithm that is doing ~50million lookups per second at max and this is around 400MB/s throughput(and certainly not faster than RAM for bigger key-value pairs) which should not be related to register bandwidth at all)
Test system's CPU is FX8150 and has 1 channel memory attached with 9GB/s bandwidth which is way higher than this single-thread cache can achieve for just "int" key "int" value pairs. So I guess it should be safe to leave RAM out of this problem. Also the mov instruction looks like a part of std::unordered_map's lookup operations. I know this CPU is old but not really ancient-old so perhaps compiler is not using the right instruction here due to some "support for old CPU" issue?
Source code to reproduce the hot-spot:
#ifndef LRUCLOCKCACHE_H_
#define LRUCLOCKCACHE_H_
#include<vector>
#include<algorithm>
#include<unordered_map>
#include<functional>
#include<mutex>
#include<unordered_map>
/* LRU-CLOCK-second-chance implementation
*
* LruKey: type of key (std::string, int, char, size_t, objects)
* LruValue: type of value that is bound to key (same as above)
* ClockHandInteger: just an optional optimization to reduce memory consumption when cache size is equal to or less than 255,65535,4B-1,...
*/
template< typename LruKey, typename LruValue,typename ClockHandInteger=size_t>
class LruClockCache
{
public:
// allocates circular buffers for numElements number of cache slots
// readMiss: cache-miss for read operations. User needs to give this function
// to let the cache automatically get data from backing-store
// example: [&](MyClass key){ return redis.get(key); }
// takes a LruKey as key, returns LruValue as value
// writeMiss: cache-miss for write operations. User needs to give this function
// to let the cache automatically set data to backing-store
// example: [&](MyClass key, MyAnotherClass value){ redis.set(key,value); }
// takes a LruKey as key and LruValue as value
LruClockCache(ClockHandInteger numElements,
const std::function<LruValue(LruKey)> & readMiss,
const std::function<void(LruKey,LruValue)> & writeMiss):size(numElements),loadData(readMiss),saveData(writeMiss)
{
ctr = 0;
// 50% phase difference between eviction and second-chance hands of the "second-chance" CLOCK algorithm
ctrEvict = numElements/2;
//loadData=readMiss;
//saveData=writeMiss;
// initialize circular buffers
for(ClockHandInteger i=0;i<numElements;i )
{
valueBuffer.push_back(LruValue());
chanceToSurviveBuffer.push_back(0);
isEditedBuffer.push_back(0);
keyBuffer.push_back(LruKey());
}
}
// get element from cache
// if cache doesn't find it in circular buffers,
// then cache gets data from backing-store
// then returns the result to user
// then cache is available from RAM on next get/set access with same key
inline
const LruValue get(const LruKey & key) noexcept
{
return accessClock2Hand(key,nullptr);
}
// only syntactic difference
inline
const std::vector<LruValue> getMultiple(const std::vector<LruKey> & key) noexcept
{
const int n = key.size();
std::vector<LruValue> result(n);
for(int i=0;i<n;i )
{
result[i]=accessClock2Hand(key[i],nullptr);
}
return result;
}
// thread-safe but slower version of get()
inline
const LruValue getThreadSafe(const LruKey & key) noexcept
{
std::lock_guard<std::mutex> lg(mut);
return accessClock2Hand(key,nullptr);
}
// set element to cache
// if cache doesn't find it in circular buffers,
// then cache sets data on just cache
// writing to backing-store only happens when
// another access evicts the cache slot containing this key/value
// or when cache is flushed by flush() method
// then returns the given value back
// then cache is available from RAM on next get/set access with same key
inline
void set(const LruKey & key, const LruValue & val) noexcept
{
accessClock2Hand(key,&val,1);
}
// thread-safe but slower version of set()
inline
void setThreadSafe(const LruKey & key, const LruValue & val) noexcept
{
std::lock_guard<std::mutex> lg(mut);
accessClock2Hand(key,&val,1);
}
// use this before closing the backing-store to store the latest bits of data
void flush()
{
for (auto mp = mapping.cbegin(); mp != mapping.cend() /* not hoisted */; /* no increment */)
{
if (isEditedBuffer[mp->second] == 1)
{
isEditedBuffer[mp->second]=0;
auto oldKey = keyBuffer[mp->second];
auto oldValue = valueBuffer[mp->second];
saveData(oldKey,oldValue);
mapping.erase(mp ); // or "it = m.erase(it)" since C 11
}
else
{
mp;
}
}
}
// CLOCK algorithm with 2 hand counters (1 for second chance for a cache slot to survive, 1 for eviction of cache slot)
// opType=0: get
// opType=1: set
LruValue const accessClock2Hand(const LruKey & key,const LruValue * value, const bool opType = 0)
{
// check if it is a cache-hit (in-cache)
typename std::unordered_map<LruKey,ClockHandInteger>::iterator it = mapping.find(key);
if(it!=mapping.end())
{
chanceToSurviveBuffer[it->second]=1;
if(opType == 1)
{
isEditedBuffer[it->second]=1;
valueBuffer[it->second]=*value;
}
return valueBuffer[it->second];
}
else // could not found key in cache, so searching in circular-buffer starts
{
long long ctrFound = -1;
LruValue oldValue;
LruKey oldKey;
while(ctrFound==-1)
{
// eviction hand lowers the "chance" status down if its 1 but slot is saved from eviction
if(chanceToSurviveBuffer[ctr]>0)
{
chanceToSurviveBuffer[ctr]=0;
}
// circular buffer has no bounds
ctr ;
if(ctr>=size)
{
ctr=0;
}
// unlucky slot is selected for eviction
if(chanceToSurviveBuffer[ctrEvict]==0)
{
ctrFound=ctrEvict;
oldValue = valueBuffer[ctrFound];
oldKey = keyBuffer[ctrFound];
}
// circular buffer has no bounds
ctrEvict ;
if(ctrEvict>=size)
{
ctrEvict=0;
}
}
// eviction algorithm start
if(isEditedBuffer[ctrFound] == 1)
{
// if it is "get"
if(opType==0)
{
isEditedBuffer[ctrFound]=0;
}
saveData(oldKey,oldValue);
// "get"
if(opType==0)
{
const LruValue && loadedData = loadData(key);
mapping.erase(keyBuffer[ctrFound]);
valueBuffer[ctrFound]=loadedData;
chanceToSurviveBuffer[ctrFound]=0;
mapping.emplace(key,ctrFound);
keyBuffer[ctrFound]=key;
return loadedData;
}
else /* "set" */
{
mapping.erase(keyBuffer[ctrFound]);
valueBuffer[ctrFound]=*value;
chanceToSurviveBuffer[ctrFound]=0;
mapping.emplace(key,ctrFound);
keyBuffer[ctrFound]=key;
return *value;
}
}
else // not edited
{
// "set"
if(opType == 1)
{
isEditedBuffer[ctrFound]=1;
}
// "get"
if(opType == 0)
{
const LruValue && loadedData = loadData(key);
mapping.erase(keyBuffer[ctrFound]);
valueBuffer[ctrFound]=loadedData;
chanceToSurviveBuffer[ctrFound]=0;
mapping.emplace(key,ctrFound);
keyBuffer[ctrFound]=key;
return loadedData;
}
else // "set"
{
mapping.erase(keyBuffer[ctrFound]);
valueBuffer[ctrFound]=*value;
chanceToSurviveBuffer[ctrFound]=0;
mapping.emplace(key,ctrFound);
keyBuffer[ctrFound]=key;
return *value;
}
}
}
}
private:
ClockHandInteger size;
std::mutex mut;
std::unordered_map<LruKey,ClockHandInteger> mapping;
std::vector<LruValue> valueBuffer;
std::vector<unsigned char> chanceToSurviveBuffer;
std::vector<unsigned char> isEditedBuffer;
std::vector<LruKey> keyBuffer;
const std::function<LruValue(LruKey)> loadData;
const std::function<void(LruKey,LruValue)> saveData;
ClockHandInteger ctr;
ClockHandInteger ctrEvict;
};
#endif /* LRUCLOCKCACHE_H_ */
Test program:
#include <iostream>
#include <fstream>
#include <mutex>
#include <map>
#include <time.h>
#include <math.h>
#include "LruClockCache.h"
int main()
{
// mandelbrot generation (softening X10) using a cache
using namespace std;
std::map < int, int > map;
int imageWidth, imageHeight, maxN;
double minR, maxR, minI, maxI;
imageWidth = 1024;
imageHeight = 1024;
maxN = 512;
minR = -1.5;
maxR = 0.7;
minI = -1.0;
maxI = 1.0;
size_t readmiss = 0;
size_t writemiss = 0;
size_t read = 0;
size_t write = 0;
LruClockCache < int, int> cache(1024*1024 1000,
[ & ](int key) {
readmiss ;
return map[key];
},
[ & ](int key, int value) {
writemiss ;
map[key] = value;
});
ofstream g("output_image.ppm");
g << "P6" << endl;
g << imageWidth << " " << imageHeight << endl;
g << "255" << endl;
double start = clock();
double t = 0;
for (int i = 0; i < imageHeight; i ) {
for (int j = 0; j < imageWidth; j ) {
double cr = mapToReal(j, imageWidth, minR, maxR);
double ci = mapToImaginary(i, imageHeight, minI, maxI);
cache.set(i j * imageWidth, findMandelbrot(cr, ci, maxN));
read ;
}
}
// image softening (may not be accurate)
for (int k = 0; k < 50; k ) {
for (int i = 1; i < imageHeight - 1; i ) {
for (int j = 1; j < imageWidth - 1; j ) {
const int && n0 = cache.get(i j * imageWidth);
const int && n1 = cache.get(i 1 j * imageWidth);
const int && n2 = cache.get(i - 1 j * imageWidth);
const int && n3 = cache.get(i (j 1) * imageWidth);
const int && n4 = cache.get(i (j - 1) * imageWidth);
const int && n = (n0 n1 n2 n3 n4) / 5;
cache.set(i j * imageWidth, n);
read = 5;
write ;
}
}
}
for (int i = 0; i < imageHeight; i ) {
for (int j = 0; j < imageWidth; j ) {
int n = cache.get(i j * imageWidth);
int r = ((int) sqrt(n) % 256);
int gr = (2 * n % 256);
int b = (n % 256);
write ;
g << (char) r << (char) gr << (char) b;
}
}
cout << "Finished!" << endl;
double stop = clock();
cout << (stop - start) / CLOCKS_PER_SEC;
cache.flush();
g.flush();
cout << endl << t << endl;
std::cout << (read - readmiss) / (double) read << std::endl;
std::cout << (write - writemiss) / (double) write << std::endl;
return 0;
}
Compiler settings have -O3.
CodePudding user response:
That's not moving between rax
and rdx
.
That's indexing into an array pointed to by rax
by rdx
and putting the result in rax
. Probable L1 cache miss.