Home > database >  How to increase C performance while processing cyrillic text files?
How to increase C performance while processing cyrillic text files?

Time:09-28

I am developing a C program to analyze letter occurrences in Russian text. It works as planned, but its performance is really poor (to compare my other python program completes this task in nearly 10 seconds while this one in approximately 7 minutes).

So my question is how to increase performance? What should I read to develop an understanding of this issue?

Main

#include <iostream>
#include <io.h>
#include <fcntl.h>
#include "text.h"

int main()
{
_setmode(_fileno(stdout), _O_U16TEXT);
Text_Container mytext("./text.txt");
mytext.initialize();
//std::wcout<< mytext.Display_text()<<std::endl;
mytext.print_dict(0);
mytext.print_dict(1);

return 0;
}

Class.cpp


void Text_Container::print_map(std::wstring_view comment, const std::map<wchar_t, wchar_t>& m)
{
    std::wcout << comment;
    for (const auto& [key, value] : m) {
        std::wcout << key << L" = " << value << L"; ";
    }
    std::wcout << L"\n";
}
void Text_Container::print_dict(int mode)
{
    if (mode == 0) {
        std::wcout << "Dictionary with no whitespaces\n";
        for (const auto& [key, value] : _dict) {
            std::wcout << L"'" << key << L"' = " << value << L";\n";
        }
        std::wcout << L"\n";
    }
    if (mode == 1) {
        std::wcout << "Dictionary with whitespaces\n";
        for (const auto& [key, value] : _dict_w) {
            std::wcout << L"'" << key << L"' = " << value << L";\n";
        }
        std::wcout << L"\n";
    }
}
void Text_Container::read_file(const char* filename)
{
    std::wifstream wif(filename);
    wif.imbue(std::locale(std::locale::empty(), new std::codecvt_utf8<wchar_t>));
    std::wstringstream wss;
    wss << wif.rdbuf();
    _text = wss.str();
}

void Text_Container::initialize()
{
    /// <summary>
    /// lowercase text
    /// </summary>
    /// <param name="mode"></param>
    std::wstring new_s;
    for (std::wstring::size_type i = 0; i < _text.size(); i  ) {
        wchar_t temp = iterate_over_map(_text[i]);
        if (temp != '\1')
        {   //whitespaces
            wchar_t temp0 = iterate_over_dictionary(_text[i], 0);
            if (temp0 != '\1')
            {
                _f_w_text = _f_w_text   temp0;
            };
            //no whitespaces
            wchar_t temp1 = iterate_over_dictionary(_text[i], 1);
            if (temp1 != '\1')
            {
                _f_text = _f_text   temp1;
            };
            new_s = new_s   temp;
        };
    }
    _text = new_s;
}
//this function pring russian letters to lowercase
wchar_t Text_Container::iterate_over_map(wchar_t& temp) {
    if (temp == L'Ё' || temp == L'Э' || temp == L'ё' || temp == L'э') {
        temp = L'е';
    }
    if (temp == L'Ъ' || temp == L'ъ') {
        temp = L'ь';
    }
    for (const auto& [key, value] : _m) {
        if (temp == value) { return value; }
        else { if (temp == key) { return value; } }
    }
    return L'\1';
}
//this fucntion verifies input letter and if it's in the selected dictionary increases value by 1
wchar_t Text_Container::iterate_over_dictionary(wchar_t& temp, int mode) {
    std::map<wchar_t, int>::iterator itr;
    if (mode == 0)
    {
        for (itr = _dict_w.begin(); itr != _dict_w.end();   itr) {
            if (itr->first == temp) {
                itr->second  ;
                return itr->first;
            }
        }
    }
    if (mode == 1)
    {
        for (itr = _dict.begin(); itr != _dict.end();   itr) {
            if (itr->first == temp) {
                itr->second  ;
                return itr->first;
            }
        }
    }
    return L'\1';
}

Class.h

#define _SILENCE_ALL_CXX17_DEPRECATION_WARNINGS
#include <string_view>
#include <sstream>
#include <fstream>
#include <codecvt>
#include <iostream>
#include <string>
#include <map>
#include <string_view>

class Text_Container
{
private:
    //dictionaries to filter text & also bring them to lowercase
    const std::map<wchar_t, wchar_t> _m{ {L'А', L'а'}, {L'Б', L'б'}, {L'В', L'в'},
                                            {L'Г', L'г'}, {L'Д', L'д'}, {L'Е', L'е'},
                                            {L'Ж', L'ж'}, {L'З', L'з'}, {L'И', L'и'},
                                            {L'Й', L'й'}, {L'К', L'к'}, {L'Л', L'л'},
                                            {L'М', L'м'}, {L'Н', L'н'}, {L'О', L'о'},
                                            {L'П', L'п'}, {L'Р', L'р'}, {L'С', L'с'},
                                            {L'Т', L'т'}, {L'У', L'у'}, {L'Ф', L'ф'},
                                            {L'Ч', L'ч'}, {L'Ц', L'ц'}, {L'Ш', L'ш'},
                                            {L'Щ', L'щ'}, {L'Ы', L'ы'}, {L'Ь', L'ь'},
                                            {L'Ю', L'ю'}, {L'Я', L'я'}, {L' ', L' '}, };
    /*
    std::map<wchar_t, int> _dict_w{ {L'а', 0}, {L'б', 0}, {L' ',0},}; //mode 0
    std::map<wchar_t, int> _dict{ {L'а', 0}, {L'б', 0},}; //mode 1
    */
    //mode 0
    std::map<wchar_t, int> _dict_w{ {L'а', 0}, {L'б', 0}, {L'в', 0},
                                            {L'г', 0}, {L'д', 0}, {L'е', 0},
                                            {L'ж', 0}, {L'з', 0}, {L'и', 0},
                                            {L'й', 0}, {L'к', 0}, {L'л', 0},
                                            {L'м', 0}, {L'н', 0}, {L'о', 0},
                                            {L'п', 0}, {L'р', 0}, {L'с', 0},
                                            {L'т', 0}, {L'у', 0}, {L'ф', 0},
                                            {L'ч', 0}, {L'ц', 0}, {L'ш', 0},
                                            {L'щ', 0}, {L'ы', 0}, {L'ь', 0},
                                            {L'ю', 0}, {L'я', 0}, {L' ', 0}, };

    //mode 1
    std::map<wchar_t, int> _dict{ {L'а', 0}, {L'б', 0}, {L'в', 0},
                                            {L'г', 0}, {L'д', 0}, {L'е', 0},
                                            {L'ж', 0}, {L'з', 0}, {L'и', 0},
                                            {L'й', 0}, {L'к', 0}, {L'л', 0},
                                            {L'м', 0}, {L'н', 0}, {L'о', 0},
                                            {L'п', 0}, {L'р', 0}, {L'с', 0},
                                            {L'т', 0}, {L'у', 0}, {L'ф', 0},
                                            {L'ч', 0}, {L'ц', 0}, {L'ш', 0},
                                            {L'щ', 0}, {L'ы', 0}, {L'ь', 0},
                                            {L'ю', 0}, {L'я', 0}, };
    //vars
    std::wstring _text = L""; // inintial text
    std::wstring _f_text = L""; //mode 1
    std::wstring _f_w_text = L"";// mode 0
    //methods
    wchar_t iterate_over_map(wchar_t& temp);
    wchar_t iterate_over_dictionary(wchar_t& temp, int mode);
public:
    //constructor
    Text_Container(const char* filename) { read_file(filename); };
    //destructor

    //methods
    std::wstring Display_text() { return _text; }
    std::map<wchar_t, wchar_t> give_m() { return _m; };

    void print_map(std::wstring_view comment, const std::map<wchar_t, wchar_t>& m);
    void read_file(const char* filename);
    void print_dict(int mode);
    void initialize();
};

CodePudding user response:

There are a few areas to investigate for slowness. Starting from biggest to slowest.

Do you need to iterate over the maps instead of using at/get? This can result in a huge performance boost

for (itr = _dict_w.begin(); itr != _dict_w.end();   itr) {
            if (itr->first == temp) {
                itr->second  ;
                return itr->first;
            }
        }

As well as:

wchar_t Text_Container::iterate_over_map(wchar_t& temp) {
    if (temp == L'Ё' || temp == L'Э' || temp == L'ё' || temp == L'э') {
        temp = L'е';
    }
    if (temp == L'Ъ' || temp == L'ъ') {
        temp = L'ь';
    }
    for (const auto& [key, value] : _m) {
        if (temp == value) { return value; }
        else { if (temp == key) { return value; } }
    }
    return L'\1';
}

Lets talk templates/datatypes, are maps required all your datasets?

  • Can try using enums.

It could significantly help this section here. Especially if you incorporate some bitmasking

if (temp == L'Ё' || temp == L'Э' || temp == L'ё' || temp == L'э') {
    temp = L'е';
}
if (temp == L'Ъ' || temp == L'ъ') {
    temp = L'ь';
}
for (const auto& [key, value] : _m) {
    if (temp == value) { return value; }
    else { if (temp == key) { return value; } }
}
return L'\1';

It may be insightful if you could log the time it takes for certain functions/processes.

  • How long does it take to read a single char?
  • How long does it take to read X chars?

Some questions to consider:

  • What is your RAM looking like?
  • How big are the files you are reading?
    • Would it help to slice them up?
  • Passing by value vs passing by reference/pointer
    • Passing by reference and setting function returns to void may help.

Lastly, I want to say that I'm not an expert at this either. So I'm interested in seeing what others have to say.

CodePudding user response:

Well this looks like a very expensive operation.

std::wstringstream wss;
wss << wif.rdbuf();
_text = wss.str();

You read the while file into a string stream wss (so that is one huge copy). If the file is large the internal buffer may be extended multiple times each forcing a copy of the string to a new location.

Then once you have build the stream you copy it into a string _text. So that is another huge copy.

Does not look like any of this is needed. Simply read from wif rather than from _text.

  • Related