Expanding numeric range that sometimes occurs between letters-CodePudding

I currently have a CSV file that is formatted like the one below where the first column specifies a numeric range of numbers and the second column indicates what category/grouping any number in that range belong to. I would like to expand this out so there is one row per number and its categories, rather than ranges (see the desired output).

A couple of notes, 1) - the ranges are still indicated for ranges that are length 1 (61320-61320). 2) - Sometimes the range begins or ends with a letter (S5342 or 0480T). In this instance, the letter should be retained and the numeric range between the example expanded.

Currently I'm reading this in with Pandas, but open to any solution

Current CSV

Range	Category
61320-61320	Apple
51320-51322	Orange
M1148-M1149	Pear

Desired output

Value	Category
61320	Apple
51320	Orange
51321	Orange
51322	Orange
M1148	Pear
M1149	Pear

CodePudding user response：

Solution

Here's one approach. I'm still working on something better, I'm not super happy with this, but your update really threw a wrench in the works :D

import re


p1 = re.compile(r"^(\d*)-(\d*)$")
p2 = re.compile(r"([a-z])(\d*)-(?:[a-z])(\d*)", re.IGNORECASE)
p3 = re.compile(r"(\d*)(?:[a-z])-(\d*)([a-z])", re.IGNORECASE)
PATTERNS = (p1, p2, p3)


def make_ranges(s):
    for p in PATTERNS:
        match = p.findall(s)
        if not match:
            continue
        match = match[0]
        if len(match) == 2:
            x, y = match
            label = ""
            result = "{value}{label}"
        else:
            if match[-1].isdigit():
                label, x, y = match
                result = "{label}{value}"
            else:
                x, y, label = match
                result = "{value}{label}"
        x, y = int(x), int(y)
        return [result.format(label=label, value=val) for val in range(x, y   1)]



df.Range = df.Range.apply(make_ranges)
df = df.explode("Range").reset_index(drop=True)

Steps

Starting with a dataframe like so:

In [3]: df
Out[3]:
         Range   Category
0  61320-61320      Apple
1  51320-51322     Orange
2  M1148-M1149       Pear
3    123X-127X    Kumquat

Using regex, this is pretty straightforward:

In [4]: df
Out[4]:
         Range Category
0  61320-61320    Apple
1  51320-51322   Orange
2  M1148-M1149     Pear
3    123X-127X  Kumquat

In [5]: df.Range = df.Range.apply(make_ranges)

In [6]: df
Out[6]:
                            Range Category
0                         [61320]    Apple
1           [51320, 51321, 51322]   Orange
2                  [M1148, M1149]     Pear
3  [123X, 124X, 125X, 126X, 127X]  Kumquat

In [7]: df = df.explode("Range").reset_index(drop=True)

In [8]: df
Out[8]:
    Range Category
0   61320    Apple
1   51320   Orange
2   51321   Orange
3   51322   Orange
4   M1148     Pear
5   M1149     Pear
6    123X  Kumquat
7    124X  Kumquat
8    125X  Kumquat
9    126X  Kumquat
10   127X  Kumquat

From here you can rename the Range column if you want.

CodePudding user response：

This works if the letters are at the start of the string, and uses a helper function from pyjanitor (which you can totally avoid, and just do a reindex, - it just makes things convenient) :

# pip install pyjanitor
import pandas as pd
import janitor

(df.assign(Value = df.Range.str.split('-'))
   .explode('Value')
   .assign(tempo = lambda df: df.Value
                                .str.extract('(\d )', expand=False)
                                .astype(int))
   # the complete function explicitly exposes the missing rows
   .complete(dict(tempo=lambda df: range(df.min(), df.max() 1)),
            'Category', 
            by = 'Range', 
            sort = True)
.bfill(axis=1)
.filter(['Value', 'Category'])
.drop_duplicates()
)
   Value Category
0  51320   Orange
1  51321   Orange
2  51322   Orange
3  61320    Apple
5  M1148     Pear
6  M1149     Pear

For a more performant option, which should help for large data, a bit more steps is involved :

temp  = df.assign(Value = df.Range.str.split('-')).explode('Value')
temp = temp.assign(tempo = temp.Value.str.extract('(\d )', 
                                                  expand=False)
           .astype(int))

# build a mapping of all the integers, without a groupby
 ranges = dict(tempo = range(temp.tempo.min(), temp.tempo.max()   1))

# get additional columns for min and max of the integers
grouped = temp.groupby('Range').tempo
temp = temp.assign(min_range = grouped.transform('min'), 
                   max_range = grouped.transform('max'))

# build the dataframe, with explicit rows, using pyjanitor's complete
# then filter for integers within min and max range
temp = (temp.complete(ranges, 
                     ('Category', 'Range', 'Value', 'min_range', 'max_range'))
            .query('min_range <= tempo <= max_range')
            .drop(columns=['min_range', 'max_range', 'Range'])
         )

# Create additional columns for start and end letters, if any:
temp = temp.assign(first = temp.Value.str.extract(r"^([A-Z])").fillna(""), 
                   last = temp.Value.str.extract(r"([A-Z])$").fillna(""))

# combine the columns to recreate the `Value` column:
(temp.assign(Value = temp['first']   
                     temp.tempo.astype(str)   
                     temp['last'])
     .filter(['Value', 'Category'])
     .drop_duplicates()
)
        Value Category
0       61320    Apple
2       51320   Orange
3       51322   Orange
4       M1148     Pear
5       M1149     Pear
250869  51321   Orange


def more_steps(df):
    temp  = (df.assign(Value = df.Range.str.split('-'))
               .explode('Value'))

    temp = temp.assign(tempo = temp.Value.str.extract('(\d )', 
                                                  expand=False)
               .astype(int))

    ranges = dict(tempo = range(temp.tempo.min(), 
                                temp.tempo.max()   1))

    grouped = temp.groupby('Range').tempo

    temp = temp.assign(min_range = grouped.transform('min'), 
                       max_range = grouped.transform('max'))

    temp = (temp.complete(ranges, 
                         ('Category', 'Range', 
                          'Value', 'min_range', 
                          'max_range'))
                .query('min_range <= tempo <= max_range')
                .drop(columns=['min_range', 'max_range', 'Range'])
            )

    temp = temp.assign(first = temp.Value.str.extract(r"^([A-Z])").fillna(""), 
                       last = temp.Value.str.extract(r"([A-Z])$").fillna(""))

    return (temp.assign(Value = temp['first']   
                        temp.tempo.astype(str)   
                        temp['last'])
                .filter(['Value', 'Category'])
               .drop_duplicates()
           )