Home > Mobile >  Haskell cassava (Data.Csv): Carry along additional columns
Haskell cassava (Data.Csv): Carry along additional columns

Time:12-02

I have two .csv files:

A.csv:

A,B,C,D,E
1,2,3,4,5
5,4,3,2,1

B.csv:

A,E,B,C,F
6,7,8,9,1
4,3,4,5,6

I would like to read them in Haskell with strict parsing rules for the variables A, B and C. I would then like to apply complex merge and filter operations to the rows of A.csv and B.csv and create a file C.csv from the result. The code block at the end of this post essentially covers this functionality.

Question:

I would now like to do all of this while keeping the variables D, E and F around. In my real dataset I have an unknown and arbitrary number of such additional columns, so I cannot easily represent them in the respective data type (ABC below). All of them should be kept and properly represented in the output dataset.

With the code below, C.csv looks like this:

A,B,C
1,2,3
5,4,3
6,8,9
4,4,5

I would instead like to have a result like this:

A,B,C,D,E,F
1,2,3,4,5,_
5,4,3,2,1,_
6,8,9,_,7,1
4,4,5,_,3,6

Is there a way to do this with cassava? Do I have to write a custom parser from scratch to get this functionality? How would I go about this?


This example code lacks the desired feature. It is a self-contained stack script.

#!/usr/bin/env stack
-- stack --resolver lts-18.7 script --package cassava,bytestring,vector

{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}

import qualified Data.ByteString.Lazy as B
import qualified Data.Csv as C
import qualified Data.Vector as V

-- | One CSV row, reduced to the three integer columns that are parsed.
data ABC = ABC
    { a :: Int -- ^ value of column @A@
    , b :: Int -- ^ value of column @B@
    , c :: Int -- ^ value of column @C@
    }
    deriving (Show)

-- | Build an 'ABC' from the named columns @A@, @B@ and @C@. The parser
-- fails when one of them is missing or cannot be converted to an 'Int'.
instance C.FromNamedRecord ABC where
    parseNamedRecord row = do
        valA <- row C..: "A"
        valB <- row C..: "B"
        valC <- row C..: "C"
        pure (ABC valA valB valC)

-- | Encode an 'ABC' as a named record with the columns @A@, @B@ and @C@.
instance C.ToNamedRecord ABC where
    toNamedRecord row =
        C.namedRecord
            [ "A" C..= a row
            , "B" C..= b row
            , "C" C..= c row
            ]

-- | Decode a CSV document that starts with a header line into 'ABC' rows.
-- The parsed header is discarded; a decoding failure is raised via 'error'
-- with the decoder's message.
decodeABC :: B.ByteString -> [ABC]
decodeABC = either error (V.toList . snd) . C.decodeByName

-- | The three column names, in the order they are listed in the header.
header :: C.Header
header = V.fromList columnNames
  where
    columnNames = ["A", "B", "C"]

-- | Read @A.csv@ and @B.csv@, print the decoded rows of each file and write
-- all rows (those of A followed by those of B) to @C.csv@.
main :: IO ()
main = do
    fileA <- B.readFile "A.csv"
    fileB <- B.readFile "B.csv"
    let decodedA = decodeABC fileA
        decodedB = decodeABC fileB
    print decodedA
    print decodedB
    -- Append the rows of B to the rows of A before encoding.
    B.writeFile "C.csv" $ C.encodeByName header (decodedA ++ decodedB)

This code includes the desired feature (thanks to the input of @Daniel Wagner):

#!/usr/bin/env stack
-- stack --resolver lts-18.7 script --package cassava,bytestring,vector,unordered-containers

{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}

import qualified Data.ByteString.Lazy as B
import qualified Data.Csv as C
import qualified Data.HashMap.Strict as HM
import qualified Data.Vector as V

-- | One CSV row: three parsed integer columns plus a record that carries
-- the remaining columns along.
data ABC = ABC
    { a :: Int -- ^ value of column @A@
    , b :: Int -- ^ value of column @B@
    , c :: Int -- ^ value of column @C@
    , addCols :: C.NamedRecord -- ^ additional columns, kept as raw fields
    }
    deriving (Show)

-- | Names of the columns that have a dedicated field in the row type.
abcDefinedCols :: [C.Name]
abcDefinedCols = ["A", "B", "C"]

-- | The names from 'abcDefinedCols' as keys of a map with unit values, in a
-- shape that can be passed to 'HM.difference'.
abcRefHashMap :: HM.HashMap C.Name ()
abcRefHashMap = HM.fromList [(col, ()) | col <- abcDefinedCols]

-- | Parse @A@, @B@ and @C@ as 'Int's and keep every field whose name is not
-- a key of 'abcRefHashMap' unparsed in 'addCols'.
instance C.FromNamedRecord ABC where
    parseNamedRecord row =
        ABC <$> row C..: "A" <*> row C..: "B" <*> row C..: "C" <*> pure extraCols
      where
        -- Everything that is left after removing the defined columns.
        extraCols = HM.difference row abcRefHashMap

-- | Encode the three typed fields together with the carried-along columns.
instance C.ToNamedRecord ABC where
    toNamedRecord row = HM.union (addCols row) typedCols
      where
        -- NOTE: 'HM.union' is left-biased, so if 'addCols' ever held a key
        -- "A", "B" or "C", that entry would win over the typed field.
        typedCols =
            C.namedRecord
                [ "A" C..= a row
                , "B" C..= b row
                , "C" C..= c row
                ]

-- | Decode a CSV document that starts with a header line into 'ABC' rows.
-- The parsed header is discarded; a decoding failure is raised via 'error'
-- with the decoder's message.
decodeABC :: B.ByteString -> [ABC]
decodeABC = either error (V.toList . snd) . C.decodeByName

-- | Header for a list of rows: the defined columns first, followed by every
-- additional column name that occurs in at least one row.
--
-- The additional names come from 'HM.keys', so their relative order is
-- whatever the hash map yields.
makeCompleteHeader :: [ABC] -> C.Header
makeCompleteHeader ms = V.fromList (abcDefinedCols ++ extraCols)
  where
    extraCols = HM.keys (HM.unions (map addCols ms))

-- | Concatenate two lists of rows and give every row the same set of
-- additional columns: a column that a row lacks is filled with @"n/a"@.
combineABCs :: [ABC] -> [ABC] -> [ABC]
combineABCs xs1 xs2 = map padRow allRows
  where
    allRows = xs1 ++ xs2

    -- One "n/a" entry for every additional column seen in any row.
    placeholders :: C.NamedRecord
    placeholders = HM.map (const "n/a") (HM.unions (map addCols allRows))

    -- 'HM.union' is left-biased: values already present in the row are
    -- kept and only the missing keys are taken from the placeholders.
    padRow row = row { addCols = HM.union (addCols row) placeholders }

-- | Read @A.csv@ and @B.csv@, print the decoded rows of each file, combine
-- them and write the result, with its complete header, to @C.csv@.
main :: IO ()
main = do
    rowsA <- decodeABC <$> B.readFile "A.csv"
    rowsB <- decodeABC <$> B.readFile "B.csv"
    print rowsA
    print rowsB
    let merged = combineABCs rowsA rowsB
        fullHeader = makeCompleteHeader merged
    B.writeFile "C.csv" (C.encodeByName fullHeader merged)

CodePudding user response:

-- | Three typed integer columns plus a named record that is carried along.
data ABCPlus = ABCPlus
    { a :: Int
    , b :: Int
    , c :: Int
    , d :: NamedRecord -- ^ the record the row was parsed from
    }
    deriving (Show)

-- | Parse @A@, @B@ and @C@ into the typed fields and keep the complete
-- input record in 'd'.
instance FromNamedRecord ABCPlus where
    parseNamedRecord m = pure ABCPlus
        <*> m .: "A"
        <*> m .: "B"
        <*> m .: "C"
        <*> pure m -- or perhaps: pure (m `HM.difference` HM.fromList [("A", ()), ("B", ()), ("C", ())])

-- | Encode a row by returning the record stored in 'd'. The typed fields
-- 'a', 'b' and 'c' are not consulted here.
instance ToNamedRecord ABCPlus where
     toNamedRecord = d -- or perhaps: \m -> d m `HM.union` namedRecord ["A" .= a m, "B" .= b m, "C" .= c m]

-- | Header for a list of rows: @A@, @B@, @C@ first, then every other column
-- name found in the 'd' record of at least one row.
--
-- The additional names come from 'HM.keys', so their relative order is
-- whatever the hash map yields.
headers :: [ABCPlus] -> Header
headers ms = header (["A", "B", "C"] ++ HM.keys combined)
  where
    -- Drop the three typed columns so they are not listed twice.
    relevant m = m `HM.difference` HM.fromList [("A", ()), ("B", ()), ("C", ())] -- or perhaps: m
    combined = HM.unions [relevant (d m) | m <- ms]
  • Related