Home > Blockchain >  Get data by OCR and set the columns names
Get data by OCR and set the columns names

Time:11-03

I run the code below to get the data of this picture but I encounter an error when try to set the column name.

library(magick)
library(tesseract)
team_img <- image_read("C:\\Users\\macosta\\Downloads\\Capture1.PNG") %>% 
  image_resize('2000x') %>%
  image_convert(type = 'Grayscale') %>%
  image_trim(fuzz = 40) %>%
  image_write(format = 'png', density = '300x300') %>%
  tesseract::ocr() %>%
  strsplit('\n') %>%
  getElement(1) %>%
  `[`(-1) %>%
  {read.table(text = .)} %>% # this part show an error
  setNames(c('Number', 'Weight', 'Specie', 'long', 'lat', 'altitude', 'Date', 'Others'))
> Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,  : 
> line 2 did not have 9 elements
<- team_img
# [1] ""                                                    
# [2] "3057 4.2 vulgaris 90234\" W 14237N 1550 m 20/1/95"   
# [3] "3058 10.4 lunatus 90234°W 14237N 1550 m 20/1/95"     
# [4] "3059 12.0  lunatus 90246’W 14238’N 1700 m 20/1/95"   
# [5] "3060 4.7 ~~ vulgaris 90247'W 14242’°N 1330 m 20/1/95"
# [6] "3061 4.7 vulgaris 90247 W 14243’N 1350 m 20/1/95"    
# [7] "3062 10.8 coccineus 90951,W 14933N 2080m 20/1/95"    
# [8] "3063 6.8 __lunatus 90248°W —-14221’N 520 m 21/1/95"  
# [9] "3064 5.8 lunatus 91211\"W 14°24°N 230 m 21/1/95"     
# [10] "3065 7.0  lunatus 91208’W 14231’N 750 m 21/1/95"     
# [11] "3066 4.2 vulgaris 91207 W 14236°N 1330 m 21/1/95"    
# [12] "3067 23.3 polyanth. 91207°W 14238’N 1630 m 21/1/95"  
# [13] "3068 10.4 coccineus 91222’W 15221’N 2070 m 22/1/95"  
# [14] "3069 8.0 coccineus 91222’W 15221’N 2070 m 22/1/95"   
# [15] "3070. --- leptosta. 91221’W 15221’N 1910 m 22/1/95"  
# [16] "3071 7.8  lunatus 91254°W 15241’N 590 m 23/1/95"     
# [17] "3072 5.0 Cnido.sp. 91254°W 15241’N 600 m 23/1/95"    
# [18] "3073 0.6 _leptosta. 91248°W 15241’N 1000 m 23/1/95"  
# [19] "3074 3.9 vulgaris 91247'W 15240°N 1180 m 23/1/95"    
# [20] "3075 4.0 vulgaris 91942\"W 15939°N 1540 m 23/1/95"   
# [21] "3076 6.4 coccineus 91231’W 14249’°N 2370 m 24/1/95"  
# [22] "3077 9.5  coccineus 91229°W 14248°N 2070 m 24/1/95"  
# [23] "3078 64 coccineus 91229°W 14247N 2050 m 24/1/95"     
# [24] "3079 11.8 vulgaris 91229’W 14247'N 2050 m 24/1/95"   
# [25] "3080 10.5  lunatus 91230°W 14246’N 1750 m 24/1/95"   
# [26] "3081 6.9 vulgaris 91231’°W 14245’N 1730 m 24/1/95" 

Data picture.

enter image description here

The idea is to obtain in a DF.

CodePudding user response:

While the Tesseract OCR does a pretty admirable job of extracting text from the sample image, it seems to struggle with consistently decoding the ° and symbols in the Latitude and Longitude columns. There is also a space between the measure and the units in the Altitude column. This results in strings that are interpreted by read.table as having either 7, 8 or 9 columns. Since your first input line appears to have 9 columns, read.table expects all subsequent lines to have 9 columns as well. You will have to figure out a way to clean the latitude, longitude output data so that you have 8 space-delimited columns you have identified in setNames

library(magick)
library(tesseract)

team_str <- image_read("C:\\Users\\macosta\\Downloads\\Capture1.PNG") %>% 
  image_resize('2000x') %>%
  image_convert(type = 'Grayscale') %>%
  image_trim(fuzz = 40) %>%
  image_write(format = 'png', density = '300x300') %>%
  tesseract::ocr() %>%
  strsplit('\n') %>%
  getElement(1)

sapply(team_str, strsplit, '\\s ') %>% lapply(length)
$`Number, Weight Species Longitude Latitude Altitude Date`
[1] 7

[[2]]
[1] 0

$`3057 4.2 vulgaris 90234" W 14237N 1550 m 20/1/95`
[1] 9

$`3058 10.4 lunatus 90234°W 14237N 1550 m 20/1/95`
[1] 8

$`3059 12.0  lunatus 90246’W 14238’N 1700 m 20/1/95`
[1] 8

$`3060 4.7 ~~ vulgaris 90247'W 14242’°N 1330 m 20/1/95`
[1] 9

$`3061 4.7 vulgaris 90247 W 14243’N 1350 m 20/1/95`
[1] 9

$`3062 10.8 coccineus 90951,W 14933N 2080m 20/1/95`
[1] 7

$`3063 6.8 __lunatus 90248°W —-14221’N 520 m 21/1/95`
[1] 8

$`3064 5.8 lunatus 91211"W 14°24°N 230 m 21/1/95`
[1] 8

$`3065 7.0  lunatus 91208’W 14231’N 750 m 21/1/95`
[1] 8

$`3066 4.2 vulgaris 91207 W 14236°N 1330 m 21/1/95`
[1] 9

$`3067 23.3 polyanth. 91207°W 14238’N 1630 m 21/1/95`
[1] 8

$`3068 10.4 coccineus 91222’W 15221’N 2070 m 22/1/95`
[1] 8

$`3069 8.0 coccineus 91222’W 15221’N 2070 m 22/1/95`
[1] 8

$`3070. --- leptosta. 91221’W 15221’N 1910 m 22/1/95`
[1] 8

$`3071 7.8  lunatus 91254°W 15241’N 590 m 23/1/95`
[1] 8

$`3072 5.0 Cnido.sp. 91254°W 15241’N 600 m 23/1/95`
[1] 8

$`3073 0.6 _leptosta. 91248°W 15241’N 1000 m 23/1/95`
[1] 8

$`3074 3.9 vulgaris 91247'W 15240°N 1180 m 23/1/95`
[1] 8

$`3075 4.0 vulgaris 91942"W 15939°N 1540 m 23/1/95`
[1] 8

$`3076 6.4 coccineus 91231’W 14249’°N 2370 m 24/1/95`
[1] 8

$`3077 9.5  coccineus 91229°W 14248°N 2070 m 24/1/95`
[1] 8

$`3078 64 coccineus 91229°W 14247N 2050 m 24/1/95`
[1] 8

$`3079 11.8 vulgaris 91229’W 14247'N 2050 m 24/1/95`
[1] 8

$`3080 10.5  lunatus 91230°W 14246’N 1750 m 24/1/95`
[1] 8

$`3081 6.9 vulgaris 91231’°W 14245’N 1730 m 24/1/95`
[1] 8

Combining everything with data.table::rbindlist makes it easier to visualize what is happening and to devise a strategy to deal with malformed rows:

sapply(team_str, str_split, '\\s ') %>% 
  lapply(t) %>%
  lapply(as.data.frame) %>%
  data.table::rbindlist(fill = TRUE) %>%
  print
        V1     V2         V3        V4        V5       V6      V7      V8      V9
 1: Number, Weight    Species Longitude  Latitude Altitude    Date    <NA>    <NA>
 2:           <NA>       <NA>      <NA>      <NA>     <NA>    <NA>    <NA>    <NA>
 3:    3057    4.2   vulgaris    90234"         W   14237N    1550       m 20/1/95
 4:    3058   10.4    lunatus   90234°W    14237N     1550       m 20/1/95    <NA>
 5:    3059   12.0    lunatus   90246’W   14238’N     1700       m 20/1/95    <NA>
 6:    3060    4.7         ~~  vulgaris   90247'W 14242’°N    1330       m 20/1/95
 7:    3061    4.7   vulgaris     90247         W  14243’N    1350       m 20/1/95
 8:    3062   10.8  coccineus   90951,W    14933N    2080m 20/1/95    <NA>    <NA>
 9:    3063    6.8  __lunatus   90248°W —-14221’N      520       m 21/1/95    <NA>
10:    3064    5.8    lunatus   91211"W   14°24°N      230       m 21/1/95    <NA>
11:    3065    7.0    lunatus   91208’W   14231’N      750       m 21/1/95    <NA>
12:    3066    4.2   vulgaris     91207         W  14236°N    1330       m 21/1/95
13:    3067   23.3  polyanth.   91207°W   14238’N     1630       m 21/1/95    <NA>
14:    3068   10.4  coccineus   91222’W   15221’N     2070       m 22/1/95    <NA>
15:    3069    8.0  coccineus   91222’W   15221’N     2070       m 22/1/95    <NA>
16:   3070.    ---  leptosta.   91221’W   15221’N     1910       m 22/1/95    <NA>
17:    3071    7.8    lunatus   91254°W   15241’N      590       m 23/1/95    <NA>
18:    3072    5.0  Cnido.sp.   91254°W   15241’N      600       m 23/1/95    <NA>
19:    3073    0.6 _leptosta.   91248°W   15241’N     1000       m 23/1/95    <NA>
20:    3074    3.9   vulgaris   91247'W   15240°N     1180       m 23/1/95    <NA>
21:    3075    4.0   vulgaris   91942"W   15939°N     1540       m 23/1/95    <NA>
22:    3076    6.4  coccineus   91231’W  14249’°N     2370       m 24/1/95    <NA>
23:    3077    9.5  coccineus   91229°W   14248°N     2070       m 24/1/95    <NA>
24:    3078     64  coccineus   91229°W    14247N     2050       m 24/1/95    <NA>
25:    3079   11.8   vulgaris   91229’W   14247'N     2050       m 24/1/95    <NA>
26:    3080   10.5    lunatus   91230°W   14246’N     1750       m 24/1/95    <NA>
27:    3081    6.9   vulgaris  91231’°W   14245’N     1730       m 24/1/95    <NA>
         V1     V2         V3        V4        V5       V6      V7      V8      V9
  • Related