I am interested in reading the observational data and wearable sensor data available here into Python. Specifically, I would like to get them into Pandas dataframes, but even getting them into a more familiar form would effectively answer the question.
Both files are *.txt.gz files. I have tried to read them like this:
import gzip
with gzip.open('../data/OBS_data.txt.gz', 'rb') as f:
file_content=f.read()
print(file_content)
But it is clear from printing the file contents that it is in some sort of encoding. I tried converting it to a UTF-8 string, unsuccessfully, with:
file_content.decode("utf-8")
But this gives the error:
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
Cell In [15], line 4
1 with gzip.open('../data/OBS_data.txt.gz', 'r') as f:
2 file_content=f.read()
----> 4 print(file_content.decode("utf-8"))
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte
I also tried using Pandas directly:
df = pd.read_csv('../data/OBS_data.txt.gz', compression='gzip')
But that gives a similar error:
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
Cell In [17], line 1
----> 1 df = pd.read_csv('../data/OBS_data.txt.gz', compression='gzip')
File /usr/lib/python3/dist-packages/pandas/util/_decorators.py:311, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
305 if len(args) > num_allow_args:
306 warnings.warn(
307 msg.format(arguments=arguments),
308 FutureWarning,
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:586, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
571 kwds_defaults = _refine_defaults_read(
572 dialect,
573 delimiter,
(...)
582 defaults={"delimiter": ","},
583 )
584 kwds.update(kwds_defaults)
--> 586 return _read(filepath_or_buffer, kwds)
File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:482, in _read(filepath_or_buffer, kwds)
479 _validate_names(kwds.get("names", None))
481 # Create the parser.
--> 482 parser = TextFileReader(filepath_or_buffer, **kwds)
484 if chunksize or iterator:
485 return parser
File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:811, in TextFileReader.__init__(self, f, engine, **kwds)
808 if "has_index_names" in kwds:
809 self.options["has_index_names"] = kwds["has_index_names"]
--> 811 self._engine = self._make_engine(self.engine)
File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:1040, in TextFileReader._make_engine(self, engine)
1036 raise ValueError(
1037 f"Unknown engine: {engine} (valid options are {mapping.keys()})"
1038 )
1039 # error: Too many arguments for "ParserBase"
-> 1040 return mapping[engine](self.f, **self.options)
File /usr/lib/python3/dist-packages/pandas/io/parsers/c_parser_wrapper.py:69, in CParserWrapper.__init__(self, src, **kwds)
67 kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None))
68 try:
---> 69 self._reader = parsers.TextReader(self.handles.handle, **kwds)
70 except Exception:
71 self.handles.close()
File /usr/lib/python3/dist-packages/pandas/_libs/parsers.pyx:542, in pandas._libs.parsers.TextReader.__cinit__()
File /usr/lib/python3/dist-packages/pandas/_libs/parsers.pyx:642, in pandas._libs.parsers.TextReader._get_header()
File /usr/lib/python3/dist-packages/pandas/_libs/parsers.pyx:843, in pandas._libs.parsers.TextReader._tokenize_rows()
File /usr/lib/python3/dist-packages/pandas/_libs/parsers.pyx:1917, in pandas._libs.parsers.raise_parser_error()
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte
So I may have misunderstood the encoding.
How do I load this data?
Strangely, this works
df2 = pd.read_csv('http://www.sociopatterns.org/wp-content/uploads/2020/12/RFID_data.txt.gz', sep='\t')
print(df2.head())
t i j DateTime
0 1560396500 ARIELLE FANA 13/06/2019 05:28
1 1560396500 ARIELLE VIOLETTE 13/06/2019 05:28
2 1560396520 FANA HARLEM 13/06/2019 05:28
3 1560396540 FELIPE ANGELE 13/06/2019 05:29
4 1560396540 ARIELLE FANA 13/06/2019 05:29
but this doesn't
df = pd.read_csv('http://www.sociopatterns.org/wp-content/uploads/2020/12/OBS_data.txt.gz', sep='\t')
print(df.head())
Python 3.10.6 (main, Aug 10 2022, 11:40:04) [GCC 11.3.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> df = pd.read_csv('http://www.sociopatterns.org/wp-content/uploads/2020/12/OBS_data.txt.gz', sep='\t')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/lib/python3/dist-packages/pandas/util/_decorators.py", line 311, in wrapper
return func(*args, **kwargs)
File "/usr/lib/python3/dist-packages/pandas/io/parsers/readers.py", line 586, in read_csv
return _read(filepath_or_buffer, kwds)
File "/usr/lib/python3/dist-packages/pandas/io/parsers/readers.py", line 482, in _read
parser = TextFileReader(filepath_or_buffer, **kwds)
File "/usr/lib/python3/dist-packages/pandas/io/parsers/readers.py", line 811, in __init__
self._engine = self._make_engine(self.engine)
File "/usr/lib/python3/dist-packages/pandas/io/parsers/readers.py", line 1040, in _make_engine
return mapping[engine](self.f, **self.options) # type: ignore[call-arg]
File "/usr/lib/python3/dist-packages/pandas/io/parsers/c_parser_wrapper.py", line 69, in __init__
self._reader = parsers.TextReader(self.handles.handle, **kwds)
File "pandas/_libs/parsers.pyx", line 542, in pandas._libs.parsers.TextReader.__cinit__
File "pandas/_libs/parsers.pyx", line 642, in pandas._libs.parsers.TextReader._get_header
File "pandas/_libs/parsers.pyx", line 843, in pandas._libs.parsers.TextReader._tokenize_rows
File "pandas/_libs/parsers.pyx", line 1917, in pandas._libs.parsers.raise_parser_error
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte
My version of pandas is 1.3.5, and here is my OS (a pretty fresh install, done last week):
$ lsb_release -a
No LSB modules are available.
Distributor ID: Ubuntu
Description: Ubuntu 22.04.1 LTS
Release: 22.04
Codename: jammy
Reinstalling pandas didn't work. I tried removing all third-party Python packages and then reinstalling pandas using pip, and it still didn't work.
CodePudding user response:
You're over-complicating things: pandas.read_csv will read compressed files directly, without you having to decompress them first.
df = pd.read_csv('http://www.sociopatterns.org/wp-content/uploads/2020/12/OBS_data.txt.gz', sep='\t')
print(df.head())
df2 = pd.read_csv('http://www.sociopatterns.org/wp-content/uploads/2020/12/RFID_data.txt.gz', sep='\t')
print(df2.head())
Output:
t i j DateTime
0 1560396500 ARIELLE FANA 13/06/2019 05:28
1 1560396500 ARIELLE VIOLETTE 13/06/2019 05:28
2 1560396520 FANA HARLEM 13/06/2019 05:28
3 1560396540 FELIPE ANGELE 13/06/2019 05:29
4 1560396540 ARIELLE FANA 13/06/2019 05:29
DateTime Actor Recipient Behavior Category Duration Point
0 13/06/2019 09:35 EWINE NaN Invisible Other 34 NO
1 13/06/2019 09:35 EWINE NaN Other Other 21 NO
2 13/06/2019 09:35 EWINE NaN Invisible Other 42 NO
3 13/06/2019 09:36 EWINE NaN Other Other 2 NO
4 13/06/2019 09:36 EWINE NaN Invisible Other 30 NO
If downloaded already:
df = pd.read_csv('../data/OBS_data.txt.gz', sep='\t')