Home > database >  How to define a RandomAccessIterator over a pointer to a vector of chars?
How to define a RandomAccessIterator over a pointer to a vector of chars?

Time:11-16

I am implementing a kind of dataframe and I want to define a RandomAccessIterator over it, in order to execute the different std algorithms, such as the sorting one. The dataframe of the example contains two column "a" and "b":

a; b;
20; 21;
20; 19;
10; 11;
40; 41;
10; 11;

After sorting with a trivial selection sort this is the result:

a; b;
10; 11;
10; 11;
20; 19;
20; 21;
40; 41;

The problem that I am facing is that the std::sort does not work properly. And I don't know weather the implementation of the iterator is sound or not.


This is the code.

File: dataframe.hpp

#pragma once
#include <iostream>
#include <charconv>

#include <vector>
#include <memory>
#include <cstring>
#include <numeric>
#include "iterator.hpp"

namespace df
{
    class Record;
    class Column;
    class Dataframe;

    namespace types
    {
        enum class Base : char
        {
            CHAR = 'A',
            UNSIGNED = 'U',
            // Other types..
        };

        class Dtype
        {
        public:
            Dtype(types::Base base, std::size_t size) : m_base_dtype{base}, m_size{size} {}
            [[nodiscard]] auto name() const
            {
                return std::string{static_cast<char>(m_base_dtype)}   std::to_string(m_size);
            }
            [[nodiscard]] auto base() const { return m_base_dtype; }
            [[nodiscard]] auto size() const { return m_size; }
            [[nodiscard]] auto is_primitive() const
            {
                switch (base())
                {
                case types::Base::CHAR:
                    return size() == 1;
                case types::Base::UNSIGNED:
                    return size() == 1 or size() == 2 or size() == 4 or size() == 8;
                }
                return false;
            }

        private:
            types::Base m_base_dtype;
            std::size_t m_size;
        };

        [[nodiscard]] static auto CHAR(const std::size_t size) { return Dtype(types::Base::CHAR, size); }
        [[nodiscard]] static auto UNSIGNED(const std::size_t size) { return Dtype(types::Base::UNSIGNED, size); }
    }

    class Column
    {
    public:
        Column(std::vector<char> &raw, const types::Dtype dtype) : m_raw{std::move(raw)}, m_dtype{dtype} {}
        Column &operator=(Column &&c) = default; // Move constructor
        [[nodiscard]] const auto &dtype() const { return m_dtype; }
        [[nodiscard]] auto &raw() { return m_raw; }
        [[nodiscard]] const auto &raw() const { return m_raw; }
        [[nodiscard]] auto *data() { return m_raw.data(); }
        [[nodiscard]] const auto *data() const { return m_raw.data(); }

    private:
        std::vector<char> m_raw;
        types::Dtype m_dtype;
    };

    class Dataframe
    {
    public:
        Dataframe(std::vector<char> &raw, std::vector<std::string> names, std::vector<types::Dtype> dtypes)
        {
            m_raw = std::move(raw);
            m_column_dtypes = dtypes;
            m_column_names = names;
            m_record_size = 0;
            for (const auto dt : dtypes)
            {
                m_column_offsets.emplace_back(m_record_size);
                m_record_size  = dt.size();
            }
            m_record_count = m_raw.size() / m_record_size;
        }
        Dataframe(std::vector<char> &raw, std::vector<types::Dtype> dtypes) : Dataframe(raw, {}, dtypes) {}
        Dataframe &operator=(Dataframe &&c) = default; // Move constructor
        [[nodiscard]] auto &raw() { return m_raw; }
        [[nodiscard]] const auto &raw() const { return m_raw; }
        [[nodiscard]] auto *data() { return m_raw.data(); }
        [[nodiscard]] const auto *data() const { return m_raw.data(); }

        // Iterators
        [[nodiscard]] df::Iterator begin()
        {
            return df::Iterator{m_raw.data(), m_record_size};
        }
        [[nodiscard]] df::Iterator end()
        {
            return df::Iterator{m_raw.data()   m_raw.size(), m_record_size};
        }

        [[nodiscard]] auto shape() const { return std::make_pair(m_record_count, m_column_dtypes.size()); }
        [[nodiscard]] auto record_count() const { return m_record_count; }
        [[nodiscard]] auto record_size() const { return m_record_size; }
        [[nodiscard]] const auto &names() const { return m_column_names; }
        [[nodiscard]] const auto &dtypes() const { return m_column_dtypes; }
        [[nodiscard]] const auto &offsets() const { return m_column_offsets; }
        void print() { print(m_record_count); }
        void print(const std::size_t initial_records)
        {
            //  Print header
            for (auto column_name : m_column_names)
            {
                std::cout << column_name << "; ";
            }
            std::cout << std::endl;

            // Print rows
            std::size_t records_to_print = std::min(initial_records, m_record_count);
            for (std::size_t i = 0; i < records_to_print; i  )
            {
                const auto start_p = i * record_size();
                auto start_field = 0;
                auto end_field = 0;
                for (auto field : m_column_dtypes)
                {
                    end_field  = field.size();

                    switch (field.base())
                    {
                    case types::Base::UNSIGNED:
                    {
                        std::uint64_t uint_value = 0;
                        memcpy(&uint_value, m_raw.data()   start_p   start_field, field.size());
                        std::cout << uint_value;
                        break;
                    }
                    case types::Base::CHAR:
                    {
                        std::string str_value = std::string(m_raw.data()   start_p   start_field, field.size());
                        std::cout << str_value;
                        break;
                    }
                    }

                    start_field = end_field;
                    // New column
                    std::cout << "; ";
                }
                // New row
                std::cout << std::endl;
            }
        }

        std::shared_ptr<Dataframe> copy() const
        {
            auto x = std::vector<char>(m_raw);
            return std::make_shared<Dataframe>(x, std::vector<std::string>(m_column_names), std::vector<types::Dtype>(m_column_dtypes));
        }

    private:
        std::vector<char> m_raw = {};
        std::vector<std::string> m_column_names = {};
        std::vector<types::Dtype> m_column_dtypes = {};
        std::vector<std::size_t> m_column_offsets = {};
        std::size_t m_record_size = {};
        std::size_t m_record_count = {};
    };

    using namespace types;

    static std::shared_ptr<Dataframe> read_from_vector(const std::vector<std::vector<std::string>> values, const std::vector<std::string> names, const std::vector<Dtype> dtypes)
    {
        const auto record_size = std::accumulate(dtypes.begin(), dtypes.end(), std::size_t{0},
                                                 [](std::size_t accum, const auto &m)
                                                 { return accum   m.size(); });

        const auto total_size = values.size() * record_size;
        const std::size_t INCR_RECORDS = std::max(total_size / (10 * record_size), std::size_t{65536});

        auto raw = std::vector<char>{};
        std::size_t written_records = 0;
        auto offsets = std::vector<std::size_t>{};

        for (int offset = 0; const auto &kd : dtypes)
        {
            offsets.push_back(offset);
            offset  = kd.size();
        }

        for (auto value : values)
        {
            if (written_records >= raw.size() / record_size)
            {
                raw.resize(raw.size()   INCR_RECORDS * record_size, char{' '});
            }

            for (int i = 0; i < names.size(); i  )
            {
                const auto name = names[i];
                const auto dtype = dtypes[i];
                const auto offset = offsets[i];

                const auto pos = written_records * record_size   offset;

                switch (dtype.base())
                {
                case df::Base::CHAR:
                {
                    const auto v = value[i];
                    const auto byte_to_copy = std::min(v.size(), dtype.size());
                    std::memcpy(raw.data()   pos,
                                v.data()   v.size() - byte_to_copy, byte_to_copy); // Prendo gli ultimi byte
                    break;
                }
                case df::Base::UNSIGNED:
                {
                    const auto v = std::stoull(value[i]);
                    const auto byte_to_copy = dtype.size();
                    std::memcpy(raw.data()   pos, &v, byte_to_copy); // Prendo gli ultimi byte
                    break;
                }
                default:
                    throw std::runtime_error("ColumnType non riconosciuto");
                }
            }

            written_records  ;
        }

        raw.resize(written_records * record_size);
        raw.shrink_to_fit();

        return std::make_shared<Dataframe>(raw, names, dtypes);
    }

}

File: iterator.hpp

#pragma once
#include <iostream>
#include <cstring>

namespace df
{

    class Iterator
    {
        std::size_t size;
        char *ptr;

    public:
        struct record_reference;

        struct record_value
        {
            std::size_t size;
            char *ptr;

            record_value(const record_reference &t) : record_value(t.size, t.ptr){};

            record_value(const std::size_t m_size, char *m_ptr)
            {
                this->size = m_size;
                this->ptr = new char[this->size];
                std::memcpy(ptr, m_ptr, this->size);
            }

            ~record_value()
            {
                delete[] this->ptr;
            }
        };

        struct record_reference
        {
            std::size_t size;
            char *ptr;

            record_reference(const std::size_t m_size, char *m_ptr)
            {
                this->size = m_size;
                this->ptr = m_ptr;
            }

            record_reference(const record_reference &t)
            {
                this->size = t.size;
                this->ptr = t.ptr;
            }

            // record_reference(const record_value &t) : record_reference(t.size, t.ptr) {};

            record_reference &operator=(const record_value &t)
            {
                std::memcpy(ptr, t.ptr, size);
                return *this;
            }

            record_reference &operator=(const record_reference &t)
            {
                std::memcpy(ptr, t.ptr, size);
                return *this;
            }

            record_reference &operator=(char *t)
            {
                std::memcpy(ptr, t, size);
                return *this;
            }

            operator char *()
            {
                return ptr;
            }
            operator const char *() const { return ptr; }
        };

        using iterator_category = std::random_access_iterator_tag;
        using value_type = record_value;
        using reference = record_reference;
        using difference_type = std::ptrdiff_t;

        // default constructible
        Iterator() : size(0), ptr(nullptr)
        {
        }

        // copy assignable
        Iterator &operator=(const Iterator &t)
        {
            size = t.size;
            ptr = t.ptr;
            return *this;
        }

        Iterator(char *ptr, const std::size_t size) : size{size}, ptr(ptr)
        {
        }

        record_reference operator*() const
        {
            return {size, ptr};
        }

        // Prefix
        Iterator &operator  ()
        {
            ptr  = size;
            return *this;
        }
        // Postfix
        Iterator operator  (int)
        {
            auto tmp = *this;
              *this;
            return tmp;
        }
        Iterator &operator--()
        {
            ptr -= size;
            return *this;
        }

        difference_type operator-(const Iterator &it) const
        {
            return (this->ptr - it.ptr) / size;
        }

        Iterator operator (const difference_type &offset) const
        {
            return Iterator(ptr   offset * size, size);
        }

        friend Iterator operator (const difference_type &diff, const Iterator &it)
        {
            return it   diff;
        }

        Iterator operator-(const difference_type &diff) const
        {
            return Iterator(ptr - diff * size, size);
        }

        reference operator[](const difference_type &offset) const
        {
            return {size, ptr   offset * size};
        }

        bool operator==(const Iterator &it) const
        {
            return this->ptr == it.ptr;
        }
        bool operator!=(const Iterator &it) const
        {
            return !(*this == it);
        }
        bool operator<(const Iterator &it) const
        {
            return this->ptr < it.ptr;
        }
        bool operator>=(const Iterator &it) const
        {
            return this->ptr >= it.ptr;
        }
        bool operator>(const Iterator &it) const
        {
            return this->ptr > it.ptr;
        }
        bool operator<=(const Iterator &it) const
        {
            return this->ptr <= it.ptr;
        }

        Iterator &operator =(const difference_type &diff)
        {
            ptr  = diff * size;
            return *this;
        }

        operator Iterator() const
        {
            return Iterator(ptr, size);
        }
    };


    void swap(df::Iterator::record_reference a, df::Iterator::record_reference b)
    {
        unsigned char *p;
        unsigned char *q;
        unsigned char *const sentry = (unsigned char *)a.ptr   a.size;

        for (p = (unsigned char *)a.ptr, q = (unsigned char *)b.ptr; p < sentry;   p,   q)
        {
            const unsigned char t = *p;
            *p = *q;
            *q = t;
        }
    }
}

File: comparator.hpp

#pragma once
#include <memory>
#include <functional>
#include "dataframe.hpp"
#include "iterator.hpp"

namespace compare
{

    using comparator_fn = std::function<int(const df::Iterator::record_reference, const df::Iterator::record_reference)>;

    template <typename T, std::size_t offset = 0, std::size_t size = sizeof(T)>
    static inline comparator_fn make_comparator()
    {
        if constexpr (size == 3 or size == 5 or size == 7 or size > 8)
            return [=](const df::Iterator::record_reference a, const df::Iterator::record_reference b)
            { return std::memcmp(a   offset, b   offset, size); };

        return [](const df::Iterator::record_reference a, const df::Iterator::record_reference b)
        { return *(T *)(a   offset) < *(T *)(b   offset) ? -1 : *(T *)(b   offset) < *(T *)(a   offset) ?  1
                                                                                                        : 0; };
    }

    template <typename T>
    static inline comparator_fn make_comparator(const std::size_t offset)
    {
        return [=](const df::Iterator::record_reference a, const df::Iterator::record_reference b)
        { return *(T *)(a   offset) < *(T *)(b   offset) ? -1 : *(T *)(b   offset) < *(T *)(a   offset) ?  1
                                                                                                        : 0; };
    }

    static inline comparator_fn make_column_comparator(const df::Dtype dtype, const std::size_t offset)
    {
        switch (dtype.base())
        {
        case df::Base::CHAR:
        {
            if (dtype.size() == 1)
                return make_comparator<std::uint8_t>(offset);
            else if (dtype.size() == 2)
                return [=](const df::Iterator::record_reference a, const df::Iterator::record_reference b)
                { return std::memcmp(a   offset, b   offset, 2); }; // C'� qualche beneficio a fissare il 2? o conviene trattarlo come uno unsigned short?

            return [=](const df::Iterator::record_reference a, const df::Iterator::record_reference b)
            { return std::memcmp(a   offset, b   offset, dtype.size()); };
        }
        case df::Base::UNSIGNED:
        {
            return [=](const df::Iterator::record_reference a, const df::Iterator::record_reference b)
            {
                std::uint64_t uint_value_a = 0;
                std::uint64_t uint_value_b = 0;
                std::memcpy(&uint_value_a, a   offset, dtype.size());
                std::memcpy(&uint_value_b, b   offset, dtype.size());
                return (uint_value_a < uint_value_b ? -1 : uint_value_a > uint_value_b ?  1
                                                                                       : 0);
            };
        }
        default:
            throw std::runtime_error("Unsupported dtype");
            break;
        }
    }

    static inline comparator_fn make_composite_two_way_comparator(const std::shared_ptr<df::Dataframe> &T)
    {
        const auto K = T->dtypes().size();
        std::vector<comparator_fn> F;

        for (int i = 0; i < K; i  )
        {
            F.emplace_back(make_column_comparator(T->dtypes()[i], T->offsets()[i]));
        }

        const auto comparator = [=](const df::Iterator::record_reference a, const df::Iterator::record_reference b)
        {
            for (int i = 0; i < K; i  )
            {
                // If equal go to the next column, otherwise return the result
                // The return value is true if the first argument is less than the second
                // and false otherwise
                if (const auto result = F[i](a, b); result != 0)
                    return result < 0;
            }
            return false;
        };

        return comparator;
    }
}

File: main.cpp

#include <iostream>
#include <vector>

#include "dataframe.hpp"
#include "comparator.hpp"

template <typename RandomAccessIterator, typename Comparator>
static void selection_sort(RandomAccessIterator first, RandomAccessIterator last, Comparator comp)
{
    for (auto i = first; i != last;   i)
    {
        auto min = i;
        for (auto j = i   1; j != last;   j)
        {
            if (comp(*j, *min))
                min = j;
        }
        df::Iterator::value_type temp = *i;
        *i = *min;
        *min = temp;
        // Alternative
        // std::iter_swap(i, min);
    }
}

int main(int argc, char const *argv[])
{
    std::vector<std::string> values{"20", "21", "20", "19", "10", "11", "40", "41", "10", "11"};

    // Create a vector that contains values grouped by 2
    std::vector<std::vector<std::string>> v;
    for (int i = 0; i < values.size(); i  = 2)
    {
        std::vector<std::string> temp;
        temp.push_back(values[i]);
        temp.push_back(values[i   1]);
        v.push_back(temp);
    }
    std::vector<std::string> column_names = {"a", "b"};
    df::Dtype d = df::Dtype(df::Base::UNSIGNED, 4);
    std::vector dtypes = {d, d};

    // Create a dataframe
    std::shared_ptr<df::Dataframe> df = df::read_from_vector(v, column_names, dtypes);

    std::cout << "Before sorting" << std::endl;
    df->print();
    // This comparator sorts the dataframe first by column a and then by column b in ascending order
    auto comparator = compare::make_composite_two_way_comparator(df);
    selection_sort(df->begin(), df->end(), comparator);
    
    std::cout << "\nAfter sorting" << std::endl;
    df->print();

    // With the std::sort it does not work
    std::sort(df->begin(), df->end(), comparator);

    return 0;
}

CodePudding user response:

Your type is not a C 17 RandomAccessIterator, because it isn't a C 17 ForwardIterator, because reference is an object type, not a reference type.

The type It satisfies ForwardIterator if

  • Let T be the value type of It. The type std::iterator_traits<It>::reference must be either
    • T& or T&& if It satisfies OutputIterator (It is mutable), or
    • const T& or const T&& otherwise (It is constant),

(Other requirements elided)

You will be able to satisfy the C 20 concept std::random_access_iterator, because that relaxes the requirement on It::reference.

CodePudding user response:

In C 17, the reference type of an iterator must be precisely value_type& in order for that iterator to be random access. Only input iterators can have the reference type be something other than value_type&. So in C 17, proxy iterators are limited to input iterators. And every algorithm written against C 17 has this expectation.

The C 20 ranges library adds the ability to have random access proxy iterators. And the C 20 algorithms that use those range concepts will respect them.

  • Related