Turn stringified tree into a list of directories (Javascript)-CodePudding

I am in need of building a function that can transform a string (which includes information about a tree structure) into a list of separated directories.

This is an example string that carries the information about the structure:

"bucket[f1[a&b&f[]&ff1[a&b]]&f2[a&b&ff1[a&b]]&f3[ff1[a]]]"

Brackets [] are appearing only after folder name and show the range of the folder (empty ones mean an empty folder). All items are separated by &.

And this is what the output should look like:

[ '/bucket/f1/a',
  '/bucket/f1/b',
  '/bucket/f1/f/',   // <- directory to folder
  '/bucket/f1/ff1/a',
  '/bucket/f1/ff1/b',
  '/bucket/f2/a',
  '/bucket/f2/b',
  '/bucket/f2/ff1/a',
  '/bucket/f2/ff1/b',
  '/bucket/f3/ff1/a' ]

I have tried an recursive algorithm that uses regex to get the content of the brackets and splits with split('&') but I failed to distinguish the & on different "directory depths" using regex.

CodePudding user response：

background

This is a non-trivial task but offers a great opportunity to learn about lexing and parsing. This is an essential step for all compilers of any programming language. Maybe this is your first time writing a language and interpreter. Maybe you didn't know your input string is a miniature language. Either way I hope you find this post useful. Please ask any follow-up questions if you are stuck.

lexer

Write lex to return a stream of token. We defined open, close, and, and identifier tokens.

function *lex(t) {
  for (const lexeme of t.match(/([a-z0-9] |[\[\]&])/g)) {
    s: switch (lexeme) {
      case "[":
        yield { open: true }; break s
      case "]":
        yield { close: true }; break s
      case "&":
        yield { and: true }; break s
      default:
        yield { identifier: lexeme }; break s
    }
  }
}

Our program input is the string of syntax in your question -

const program =
  "bucket[f1[a&b&f[]&ff1[a&b]]&f2[a&b&ff1[a&b]]&f3[ff1[a]]]"

for (const v of lex(program))
  console.log(v)

Here's the tokens we get -

{identifier: "bucket"}
{open: true}
{identifier: "f1"}
{open: true}
{identifier: "a"}
{and: true}
{identifier: "b"}
{and: true}
{identifier: "f"}
{open: true}
{close: true}
{and: true}
{identifier: "ff1"}
{open: true}
{identifier: "a"}
{and: true}
{identifier: "b"}
{close: true}
{close: true}
{and: true}
{identifier: "f2"}
{open: true}
{identifier: "a"}
{and: true}
{identifier: "b"}
{and: true}
{identifier: "ff1"}
{open: true}
{identifier: "a"}
{and: true}
{identifier: "b"}
{close: true}
{close: true}
{and: true}
{identifier: "f3"}
{open: true}
{identifier: "ff1"}
{open: true}
{identifier: "a"}
{close: true}
{close: true}
{close: true}

parse

Next we have to parse the tokens and give them meaning. For each token, we check which token we have and modify the returned value (r) accordingly -

function parse(t, base = "") {
  const r = [[base]]
  let _0, _1
  for (const v of t) {
    s: switch (true) {
      case v.open:
        _0 = r.pop()
        r.push(_0)
        r.push([_0.pop()])
        break s
      case v.and:
        break s
      case v.close:
        _0 = r.pop()
        _1 = r.pop()
        _1.push(_0)
        r.push(_1)
        break s
      default:
        _0 = r.pop()
        _0.push(v.identifier)
        r.push(_0)
        break s
    }
  }
  return r[0]
}

We feed the output of lex to parse -

console.log(parse(lex(program)))

And we get back a tree of our file structure -

[
  "",
  [
    "bucket",
    [
      "f1",
      "a",
      "b",
      [
        "f"
      ],
      [
        "ff1",
        "a",
        "b"
      ]
    ],
    [
      "f2",
      "a",
      "b",
      [
        "ff1",
        "a",
        "b"
      ]
    ],
    [
      "f3",
      [
        "ff1",
        "a"
      ]
    ]
  ]
]

flatten

Finally we write a flatten function which flattens our file tree to a list of paths. Because your program asks for trailing slash on directories without children, we make a special call out in our code -

function *flatten(t) {
  switch (t?.constructor) {
    case Array:
      if (t.length === 1) // special case for empty dir
        yield [t[0], ""]
      else
        for (const child of t.slice(1))
          for (const path of flatten(child))
            yield [t[0], ...path]
      break
    default:
      yield [t]
  }
}

We take the output from parse and feed it to flatten -

for (const path of flatten(parse(lex(program))))
  console.log(path.join("/"))

Each path is like ["", "bucket", "f1", "a"] allowing the caller to manipulate them however they see fit. We construct a string, joining each segment with "/" -

/bucket/f1/a
/bucket/f1/b
/bucket/f1/f/
/bucket/f1/ff1/a
/bucket/f1/ff1/b
/bucket/f2/a
/bucket/f2/b
/bucket/f2/ff1/a
/bucket/f2/ff1/b
/bucket/f3/ff1/a

demo

Here's a fully functioning demo. Run it below to verify the results.

function *lex(t) {
  for (const lexeme of t.match(/([a-z0-9] |[\[\]&])/g)) {
    s: switch (lexeme) {
      case "[":
        yield { open: true }; break s
      case "]":
        yield { close: true }; break s
      case "&":
        yield { and: true }; break s
      default:
        yield { identifier: lexeme }; break s
    }
  }
}

function parse(t, base = "") {
  const r = [[base]]
  let _0, _1
  for (const v of t) {
    s: switch (true) {
      case v.open:
        _0 = r.pop()
        r.push(_0)
        r.push([_0.pop()])
        break s
      case v.and:
        break s
      case v.close:
        _0 = r.pop()
        _1 = r.pop()
        _1.push(_0)
        r.push(_1)
        break s
      default:
        _0 = r.pop()
        _0.push(v.identifier)
        r.push(_0)
        break s
    }
  }
  return r[0]
}

function *flatten(t) {
  switch (t?.constructor) {
    case Array:
      if (t.length === 1) // special case for empty dir
        yield [t[0], ""]
      else
        for (const child of t.slice(1))
          for (const path of flatten(child))
            yield [t[0], ...path]
      break
    default:
      yield [t]
  }
}

const program =
  "bucket[f1[a&b&f[]&ff1[a&b]]&f2[a&b&ff1[a&b]]&f3[ff1[a]]]"

for (const path of flatten(parse(lex(program))))
  console.log(path.join("/"))

.as-console-wrapper { min-height: 100%; top: 0; }

changing the base path

Did you notice parse accepts a second argument, base? You can specify a base path and it will use this when interpreting all paths in the input program. Below we call parse with /usr/local as the base -

for (const path of flatten(parse(lex(program), "/usr/local")))
  console.log(path.join("/"))

/usr/local/bucket/f1/a
/usr/local/bucket/f1/b
/usr/local/bucket/f1/f/
/usr/local/bucket/f1/ff1/a
/usr/local/bucket/f1/ff1/b
/usr/local/bucket/f2/a
/usr/local/bucket/f2/b
/usr/local/bucket/f2/ff1/a
/usr/local/bucket/f2/ff1/b
/usr/local/bucket/f3/ff1/a

We could give a relative path too -

for (const path of flatten(parse(lex(program), ".")))
  console.log(path.join("/"))

./bucket/f1/a
./bucket/f1/b
./bucket/f1/f/
./bucket/f1/ff1/a
./bucket/f1/ff1/b
./bucket/f2/a
./bucket/f2/b
./bucket/f2/ff1/a
./bucket/f2/ff1/b
./bucket/f3/ff1/a

visualizing parse

How does parse turn a linear list of tokens into a hierarchical tree? Below we visualize the steps taken for each token -

token	state
initial state	[[.]]
{identifier: bucket}	[[., bucket]]
{open}	[[.], [bucket]]
{identifier: f1}	[[.], [bucket, f1]]
{open}	[[.], [bucket], [f1]]
{identifier: a}	[[.], [bucket], [f1, a]]
{identifier: b}	[[.], [bucket], [f1, a, b]]
{identifier: f}	[[.], [bucket], [f1, a, b, f]]
{open}	[[.], [bucket], [f1, a, b], [f]]
{close}	[[.], [bucket], [f1, a, b, [f]]]
{identifier: ff1}	[[.], [bucket], [f1, a, b, [f], ff1]]
{open}	[[.], [bucket], [f1, a, b, [f]], [ff1]]
{identifier: a}	[[.], [bucket], [f1, a, b, [f]], [ff1, a]]
{identifier: b}	[[.], [bucket], [f1, a, b, [f]], [ff1, a, b]]
{close}	[[.], [bucket], [f1, a, b, [f], [ff1, a, b]]]
{close}	[[.], [bucket, [f1, a, b, [f], [ff1, a, b]]]]
{identifier: f2}	[[.], [bucket, [f1, a, b, [f], [ff1, a, b]], f2]]
{open}	[[.], [bucket, [f1, a, b, [f], [ff1, a, b]]], [f2]]
{identifier: a}	[[.], [bucket, [f1, a, b, [f], [ff1, a, b]]], [f2, a]]
{identifier: b}	[[.], [bucket, [f1, a, b, [f], [ff1, a, b]]], [f2, a, b]]
{identifier: ff1}	[[.], [bucket, [f1, a, b, [f], [ff1, a, b]]], [f2, a, b, ff1]]
{open}	[[.], [bucket, [f1, a, b, [f], [ff1, a, b]]], [f2, a, b], [ff1]]
{identifier: a}	[[.], [bucket, [f1, a, b, [f], [ff1, a, b]]], [f2, a, b], [ff1, a]]
{identifier: b}	[[.], [bucket, [f1, a, b, [f], [ff1, a, b]]], [f2, a, b], [ff1, a, b]]
{close}	[[.], [bucket, [f1, a, b, [f], [ff1, a, b]]], [f2, a, b, [ff1, a, b]]]
{close}	[[.], [bucket, [f1, a, b, [f], [ff1, a, b]], [f2, a, b, [ff1, a, b]]]]
{identifier: f3}	[[.], [bucket, [f1, a, b, [f], [ff1, a, b]], [f2, a, b, [ff1, a, b]], f3]]
{open}	[[.], [bucket, [f1, a, b, [f], [ff1, a, b]], [f2, a, b, [ff1, a, b]]], [f3]]
{identifier: ff1}	[[.], [bucket, [f1, a, b, [f], [ff1, a, b]], [f2, a, b, [ff1, a, b]]], [f3, ff1]]
{open}	[[.], [bucket, [f1, a, b, [f], [ff1, a, b]], [f2, a, b, [ff1, a, b]]], [f3], [ff1]]
{identifier: a}	[[.], [bucket, [f1, a, b, [f], [ff1, a, b]], [f2, a, b, [ff1, a, b]]], [f3], [ff1, a]]
{close}	[[.], [bucket, [f1, a, b, [f], [ff1, a, b]], [f2, a, b, [ff1, a, b]]], [f3, [ff1, a]]]
{close}	[[.], [bucket, [f1, a, b, [f], [ff1, a, b]], [f2, a, b, [ff1, a, b]], [f3, [ff1, a]]]]
{close}	[[., [bucket, [f1, a, b, [f], [ff1, a, b]], [f2, a, b, [ff1, a, b]], [f3, [ff1, a]]]]]

homework

Our code above works but there are some scenarios we should address to make our program more robust.

lex -

How would we handle invalid syntax?
How would we include the character position offset for each token?

parse -

Why is it safe to ignore the & tokens?
How would we throw an error for unexpected [
How would we throw an error for unexpected ]
How would we limit the maximum depth of the generated file tree?
Where would we throw an error for tokens of invalid type?

CodePudding user response：

Another approach

There is already a fantastic answer from Mulan. You can learn a great deal from that code, and I would suggest studying it carefully if you don't have much experience writing parsers.

But there are many ways to solve such problems, and here we examine an entirely different approach. It can't teach as much as Mulan's answer, but there is a useful technique that might be new to some.

We'll use a different example from yours for discussion, one with longer and more familiar node names. We want to convert this:

'root[bin&export[home[user1&user2&user3]]&lib&usr[ccs&lib]]'

into this:

[
  'root/bin',
  'root/export/home/user1',
  'root/export/home/user2',
  'root/export/home/user3',
  'root/lib',
  'root/usr/ccs',
  'root/usr/lib'
]

Mulan's answer wrote a parser for this mini-language. Here we do a series of transforms that ends in

'root/bin&root/export/home/user1&root/export/home/user2&root/export/home/user3&root/lib&root/usr/ccs&root/usr/lib'

and then simply split the result at every '&' to get your requested output. The trick is that we work with one bracket-pair at a time, always one that nests no further brackets inside it. When we've replaced that, we scan the new string for any further unnested brackets, and continue recursively until there are none to be found. The steps will look like this:

'root[bin&export[home[user1&user2&user3]]&lib&usr[ccs&lib]]'
'root[bin&export[home/user1&home/user2&home/user3]&lib&usr[ccs&lib]]'
'root[bin&export/home/user1&export/home/user2&export/home/user3&lib&usr[ccs&lib]]'
'root[bin&export/home/user1&export/home/user2&export/home/user3&lib&usr/ccs&usr/lib]'
'root/bin&root/export/home/user1&root/export/home/user2&root/export/home/user3&root/lib&root/usr/ccs&root/usr/lib'

Implementation

We do this using a regular expression that captures a pair of braces with no braces inside and the preceding string that applies to them:

root[bin&export[home[user1&user2&user3]]&lib&usr[ccs&lib]]
               |    |_________________|`--- closing brace (']')
               |    |        `-------- ---- parts (['user1', 'user2', 'user3'])
               |    |`---------------- ---- opening brace ('[')
               |____|                 |
               |  `------------------- ---- prefix ('home')
               |______________________|
                          `---------------- full match ('[home[user1&user2&user3]')

Then we replace the string we've found, joining each of the words inside the brackets with the prefix and a '/' to yield 'home/user1&home/user2&home/user3'. Then we recur on this new value, ending when there is no match.

The code looks like this:

const brackets = /([^\[\]\&] )\[([^\[\]]*)\]/

const expand = (s) =>
  brackets .test (s)
    ? expand (s .replace (
        brackets, 
        (_, prefix, ps) => ps .split ('&') .map (p => prefix   '/'   p) .join ('&')
      ))
    : s

const parse = (s) =>
  expand (s) .split ('&') 


console .log (parse ('root[bin&export[home[user1&user2&user3]]&lib&usr[ccs&lib]]'))
console .log (parse ('bucket[f1[ab&f[]&ff1[a&b]]&f2[a&b&ff1[a&b]]&f3[ff1[a]]]'))

.as-console-wrapper {max-height: 100% !important; top: 0}

Notes

Our running example does not demonstrate the empty folders, but we show the example from the question to demonstrate that it does as required. No special processing turns out to be necessary for this.
This is recursive, and could run into recursion-depth problems if you're using this for large inputs. It would be a simple, mechanical process to turn this into a while loop.
Regular expressions are very powerful, but often difficult to understand. I personally find them quite worth the effort. You can find an explanation of this regex on Regex101.com. But when my first attempt at a regex solution failed, I wrote a mostly-equivalent solution without the regex. It might be instructive to compare that imperative code with this version. After writing it, I was able to recognize what I'd done wrong with my initial regular expression and write this cleaner version.
We do nothing here to capture bad input. It shouldn't be hard to do so here, but we leave it as an exercise for the reader. (Hint: it seems possible that any bad input would necessarily leave some brackets behind after the expansion.)