sophiamyang / condastats Goto Github PK
View Code? Open in Web Editor NEW. License: Other
License: Other
I am trying to use the overall() function in a jupyterlab notebook to get download data for a number of packages but I keep getting KeyError: 'statistics'. I get the same error when I run it in the terminal. Any chance you know why I'm getting this error? Thank you!
Just like in the tutorial, I tried this command: overall(['pandas','dask']) and I get the following error and traceback:
KeyError Traceback (most recent call last)
<ipython-input-19-132bed5942ba> in <module>
----> 1 overall(['pandas','dask'])
~/opt/anaconda3/lib/python3.8/site-packages/condastats/cli.py in overall(package, month, start_month, end_month, monthly, complete, pkg_platform, data_source, pkg_version, pkg_python)
59 # if all optional arguments are None, read in
60 # all the data for a certain package
---> 61 df = dd.read_parquet(
62 "s3://anaconda-package-data/conda/monthly/*/*.parquet",
63 storage_options={"anon": True},
~/opt/anaconda3/lib/python3.8/site-packages/dask/dataframe/io/parquet/core.py in read_parquet(path, columns, filters, categories, index, storage_options, engine, gather_statistics, split_row_groups, read_from_paths, chunksize, **kwargs)
305 index = [index]
306
--> 307 read_metadata_result = engine.read_metadata(
308 fs,
309 paths,
~/opt/anaconda3/lib/python3.8/site-packages/dask/dataframe/io/parquet/fastparquet.py in read_metadata(cls, fs, paths, categories, index, gather_statistics, filters, split_row_groups, **kwargs)
691
692 # Break `pf` into a list of `parts`
--> 693 parts, stats = cls._construct_parts(
694 fs,
695 pf,
~/opt/anaconda3/lib/python3.8/site-packages/dask/dataframe/io/parquet/fastparquet.py in _construct_parts(cls, fs, pf, paths, parts, dtypes, base_path, filters, index_cols, categories, split_row_groups, gather_statistics)
647
648 # Process row-groups and return `(parts, stats)`
--> 649 return cls._process_metadata(
650 pf,
651 dtypes,
~/opt/anaconda3/lib/python3.8/site-packages/dask/dataframe/io/parquet/fastparquet.py in _process_metadata(cls, pf, dtypes, split_row_groups, gather_statistics, stat_col_indices, filters, categories, base_path, paths, fs)
591
592 # Convert organized row-groups to parts
--> 593 parts, stats = _row_groups_to_parts(
594 gather_statistics,
595 split_row_groups,
~/opt/anaconda3/lib/python3.8/site-packages/dask/dataframe/io/parquet/utils.py in _row_groups_to_parts(gather_statistics, split_row_groups, file_row_groups, file_row_group_stats, file_row_group_column_stats, stat_col_indices, make_part_func, make_part_kwargs)
603 for filename, row_groups in file_row_groups.items():
604
--> 605 part = make_part_func(
606 filename,
607 row_groups,
~/opt/anaconda3/lib/python3.8/site-packages/dask/dataframe/io/parquet/fastparquet.py in _make_part(cls, filename, rg_list, fs, pf, base_path, partitions)
543
544 if partitions:
--> 545 real_row_groups = cls._get_thrift_row_groups(
546 pf,
547 filename,
~/opt/anaconda3/lib/python3.8/site-packages/dask/dataframe/io/parquet/fastparquet.py in _get_thrift_row_groups(cls, pf, filename, row_groups)
509 for rg, rg_global in row_groups:
510 row_group = pf.row_groups[rg_global]
--> 511 row_group.statistics = None
512 row_group.helper = None
513 for c, col in enumerate(row_group.columns):
~/opt/anaconda3/lib/python3.8/site-packages/fastparquet/cencoding.pyx in fastparquet.cencoding.ThriftObject.__setattr__()
KeyError: 'statistics'
When making a request, I have the following PermissionError.
Traceback (most recent call last):
File "/usr/local/lib/python3.8/site-packages/s3fs/core.py", line 110, in _error_wrapper
return await func(*args, **kwargs)
File "/usr/local/lib/python3.8/site-packages/aiobotocore/client.py", line 265, in _make_api_call
raise error_class(parsed_response, operation_name)
botocore.exceptions.ClientError: An error occurred (AccessDenied) when calling the GetObject operation: Access Denied
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/usr/local/bin/condastats", line 8, in <module>
sys.exit(main())
File "/usr/local/lib/python3.8/site-packages/condastats/cli.py", line 387, in main
overall(
File "/usr/local/lib/python3.8/site-packages/condastats/cli.py", line 87, in overall
df = df.compute()
File "/usr/local/lib/python3.8/site-packages/dask/base.py", line 315, in compute
(result,) = compute(self, traverse=False, **kwargs)
File "/usr/local/lib/python3.8/site-packages/dask/base.py", line 598, in compute
results = schedule(dsk, keys, **kwargs)
File "/usr/local/lib/python3.8/site-packages/dask/threaded.py", line 89, in get
results = get_async(
File "/usr/local/lib/python3.8/site-packages/dask/local.py", line 511, in get_async
raise_exception(exc, tb)
File "/usr/local/lib/python3.8/site-packages/dask/local.py", line 319, in reraise
raise exc
File "/usr/local/lib/python3.8/site-packages/dask/local.py", line 224, in execute_task
result = _execute_task(task, data)
File "/usr/local/lib/python3.8/site-packages/dask/core.py", line 119, in _execute_task
return func(*(_execute_task(a, cache) for a in args))
File "/usr/local/lib/python3.8/site-packages/dask/optimization.py", line 990, in __call__
return core.get(self.dsk, self.outkey, dict(zip(self.inkeys, args)))
File "/usr/local/lib/python3.8/site-packages/dask/core.py", line 149, in get
result = _execute_task(task, cache)
File "/usr/local/lib/python3.8/site-packages/dask/core.py", line 119, in _execute_task
return func(*(_execute_task(a, cache) for a in args))
File "/usr/local/lib/python3.8/site-packages/dask/dataframe/io/parquet/core.py", line 89, in __call__
return read_parquet_part(
File "/usr/local/lib/python3.8/site-packages/dask/dataframe/io/parquet/core.py", line 587, in read_parquet_part
dfs = [
File "/usr/local/lib/python3.8/site-packages/dask/dataframe/io/parquet/core.py", line 588, in <listcomp>
func(fs, rg, columns.copy(), index, **toolz.merge(kwargs, kw))
File "/usr/local/lib/python3.8/site-packages/dask/dataframe/io/parquet/arrow.py", line 435, in read_partition
arrow_table = cls._read_table(
File "/usr/local/lib/python3.8/site-packages/dask/dataframe/io/parquet/arrow.py", line 1518, in _read_table
arrow_table = _read_table_from_path(
File "/usr/local/lib/python3.8/site-packages/dask/dataframe/io/parquet/arrow.py", line 239, in _read_table_from_path
return pq.ParquetFile(fil, **pre_buffer).read(
File "/usr/local/lib/python3.8/site-packages/pyarrow/parquet/__init__.py", line 277, in __init__
self.reader.open(
File "pyarrow/_parquet.pyx", line 1213, in pyarrow._parquet.ParquetReader.open
File "/usr/local/lib/python3.8/site-packages/fsspec/spec.py", line 1578, in read
out = self.cache._fetch(self.loc, self.loc + length)
File "/usr/local/lib/python3.8/site-packages/fsspec/caching.py", line 41, in _fetch
return self.fetcher(start, stop)
File "/usr/local/lib/python3.8/site-packages/s3fs/core.py", line 2030, in _fetch_range
return _fetch_range(
File "/usr/local/lib/python3.8/site-packages/s3fs/core.py", line 2173, in _fetch_range
resp = fs.call_s3(
File "/usr/local/lib/python3.8/site-packages/fsspec/asyn.py", line 86, in wrapper
return sync(self.loop, func, *args, **kwargs)
File "/usr/local/lib/python3.8/site-packages/fsspec/asyn.py", line 66, in sync
raise return_result
File "/usr/local/lib/python3.8/site-packages/fsspec/asyn.py", line 26, in _runner
result[0] = await coro
File "/usr/local/lib/python3.8/site-packages/s3fs/core.py", line 332, in _call_s3
return await _error_wrapper(
File "/usr/local/lib/python3.8/site-packages/s3fs/core.py", line 137, in _error_wrapper
raise err
PermissionError: Access Denied
import condastats.cli
condastats.cli.overall('numpy')
I could not reopen the previous issue since I did not close it myself. Thanks!
I wanted to collect some statistics of a package with condastats but encountered the error.
Describe what you were trying to get done.
Tell us what happened, what went wrong, and what you expected to happen.
$ conda install -c conda-forge condastats
$ condastats overall pandas
ValueError: Not all columns are categoricals
Is there something I am doing wrong?
I am not seeing any download stats for Python 3.10. Based on the numbers, it looks like 3.10 downloads may be counted towards 3.1.
# condastats pkg_python pandas --start_month=2022-07 --end_month=2022-07
pkg_name pkg_python
pandas 3.6 255104
3.8 449448
3.9 482697
2.7 27661
3.5 39544
3.7 701528
3.1 155171
3.3 388
3.4 1321
2.6 588
Name: counts, dtype: int64
condastats shows an exponential increase in downloads over the last few months. While we're confident in the quality of our package ;-), this seems unrealistic and, in any case, unexpected (*100 between 2023/12 and 2024/05 !).
Do you have any idea why these variations are occurring?
condastats overall pyagrum --monthly
[...]
2023-08 2484
2023-09 2433
2023-10 4560
2023-11 3154
2023-12 1114
2024-01 2829
2024-02 2812
2024-03 12573
2024-04 66098
2024-05 110944
Thank you for any hints, explanation or information on this subject
The S3 server cannot be accessed.
(venv) tmr@niphredil:~/PycharmProjects/pdfo.github.io/static/scripts/stats$ condastats overall pdfo
Traceback (most recent call last):
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/s3fs/core.py", line 261, in _call_s3
out = await method(**additional_kwargs)
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/aiobotocore/client.py", line 180, in _make_api_call
raise error_class(parsed_response, operation_name)
botocore.exceptions.ClientError: An error occurred (AccessDenied) when calling the GetObject operation: Access Denied
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/bin/condastats", line 8, in <module>
sys.exit(main())
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/condastats/cli.py", line 382, in main
overall(
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/condastats/cli.py", line 85, in overall
df = df.compute()
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/dask/base.py", line 288, in compute
(result,) = compute(self, traverse=False, **kwargs)
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/dask/base.py", line 571, in compute
results = schedule(dsk, keys, **kwargs)
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/dask/threaded.py", line 79, in get
results = get_async(
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/dask/local.py", line 507, in get_async
raise_exception(exc, tb)
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/dask/local.py", line 315, in reraise
raise exc
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/dask/local.py", line 220, in execute_task
result = _execute_task(task, data)
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/dask/core.py", line 119, in _execute_task
return func(*(_execute_task(a, cache) for a in args))
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/dask/optimization.py", line 969, in __call__
return core.get(self.dsk, self.outkey, dict(zip(self.inkeys, args)))
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/dask/core.py", line 149, in get
result = _execute_task(task, cache)
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/dask/core.py", line 119, in _execute_task
return func(*(_execute_task(a, cache) for a in args))
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/dask/core.py", line 119, in <genexpr>
return func(*(_execute_task(a, cache) for a in args))
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/dask/core.py", line 113, in _execute_task
return [_execute_task(a, cache) for a in arg]
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/dask/core.py", line 113, in <listcomp>
return [_execute_task(a, cache) for a in arg]
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/dask/core.py", line 119, in _execute_task
return func(*(_execute_task(a, cache) for a in args))
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py", line 87, in __call__
return read_parquet_part(
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py", line 422, in read_parquet_part
dfs = [
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py", line 423, in <listcomp>
func(fs, rg, columns.copy(), index, **toolz.merge(kwargs, kw))
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/dask/dataframe/io/parquet/fastparquet.py", line 953, in read_partition
parquet_file = ParquetFile(
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/fastparquet/api.py", line 109, in __init__
basepath, fmd = metadata_from_many(fn, verify_schema=verify,
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/fastparquet/util.py", line 155, in metadata_from_many
pfs = [api.ParquetFile(fn, open_with=open_with) for fn in file_list]
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/fastparquet/util.py", line 155, in <listcomp>
pfs = [api.ParquetFile(fn, open_with=open_with) for fn in file_list]
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/fastparquet/api.py", line 133, in __init__
self._parse_header(f, verify)
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/fastparquet/api.py", line 176, in _parse_header
head_size = struct.unpack('<i', f.read(4))[0]
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/fsspec/spec.py", line 1565, in read
out = self.cache._fetch(self.loc, self.loc + length)
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/fsspec/caching.py", line 377, in _fetch
self.cache = self.fetcher(start, bend)
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/s3fs/core.py", line 1896, in _fetch_range
return _fetch_range(
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/s3fs/core.py", line 2038, in _fetch_range
resp = fs.call_s3(
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/fsspec/asyn.py", line 91, in wrapper
return sync(self.loop, func, *args, **kwargs)
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/fsspec/asyn.py", line 71, in sync
raise return_result
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/fsspec/asyn.py", line 25, in _runner
result[0] = await coro
File "/home/tmr/PycharmProjects/pdfo.github.io/static/scripts/stats/venv/lib/python3.9/site-packages/s3fs/core.py", line 281, in _call_s3
raise err
PermissionError: Access Denied
Hey @sophiamyang, would you mind allowing me take this off your plate and move it into the conda-incubator organization?
That'd allow us to help with things like #20 etc. Thank you!
After installing condastats
using conda install -c conda-forge condastats
[OK], running condastats leads to the following error:
$ condastats overall numpy
Traceback (most recent call last):
File "/home/hillairet/miniconda3/lib/python3.8/site-packages/fsspec/registry.py", line 219, in get_filesystem_class
register_implementation(protocol, _import_class(bit["class"]))
File "/home/hillairet/miniconda3/lib/python3.8/site-packages/fsspec/registry.py", line 242, in _import_class
mod = importlib.import_module(mod)
File "/home/hillairet/miniconda3/lib/python3.8/importlib/__init__.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 1014, in _gcd_import
File "<frozen importlib._bootstrap>", line 991, in _find_and_load
File "<frozen importlib._bootstrap>", line 975, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 671, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 843, in exec_module
File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
File "/home/hillairet/miniconda3/lib/python3.8/site-packages/s3fs/__init__.py", line 1, in <module>
from .core import S3FileSystem, S3File
File "/home/hillairet/miniconda3/lib/python3.8/site-packages/s3fs/core.py", line 12, in <module>
from fsspec.asyn import AsyncFileSystem, sync, sync_wrapper, maybe_sync
ImportError: cannot import name 'maybe_sync' from 'fsspec.asyn' (/home/hillairet/miniconda3/lib/python3.8/site-packages/fsspec/asyn.py)
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/hillairet/miniconda3/bin/condastats", line 10, in <module>
sys.exit(main())
File "/home/hillairet/miniconda3/lib/python3.8/site-packages/condastats/cli.py", line 382, in main
overall(
File "/home/hillairet/miniconda3/lib/python3.8/site-packages/condastats/cli.py", line 61, in overall
df = dd.read_parquet(
File "/home/hillairet/miniconda3/lib/python3.8/site-packages/dask/dataframe/io/parquet/core.py", line 298, in read_parquet
fs, _, paths = get_fs_token_paths(path, mode="rb", storage_options=storage_options)
File "/home/hillairet/miniconda3/lib/python3.8/site-packages/fsspec/core.py", line 636, in get_fs_token_paths
cls = get_filesystem_class(protocol)
File "/home/hillairet/miniconda3/lib/python3.8/site-packages/fsspec/registry.py", line 221, in get_filesystem_class
raise ImportError(bit["err"]) from e
ImportError: Install s3fs to access S3
despite that s3fs
and fsspec
packages are already installed:
$ conda list s3fs
# Name Version Build Channel
s3fs 0.5.1 py_0 conda-forge
fsspec 2021.10.0 pyhd8ed1ab_0 conda-forge
The problem seems to come from changes in fsspec
:
>>> from fsspec.asyn import maybe_sync
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
ImportError: cannot import name 'maybe_sync' from 'fsspec.asyn' (/home/hillairet/miniconda3/lib/python3.8/site-packages/fsspec/asyn.py)
name: condastats
channels:
- conda-forge
dependencies:
- aiobotocore=2.12.2=pyhd8ed1ab_0
- aiohttp=3.9.5=py311h05b510d_0
- aioitertools=0.11.0=pyhd8ed1ab_0
- aiosignal=1.3.1=pyhd8ed1ab_0
- attrs=23.2.0=pyh71513ae_0
- aws-c-auth=0.7.18=h382b9c6_0
- aws-c-cal=0.6.11=hd34e5fa_0
- aws-c-common=0.9.15=h93a5062_0
- aws-c-compression=0.2.18=hd34e5fa_3
- aws-c-event-stream=0.4.2=h247c08a_8
- aws-c-http=0.8.1=hf9e830b_10
- aws-c-io=0.14.7=h33d81b3_6
- aws-c-mqtt=0.10.4=h5f4abda_0
- aws-c-s3=0.5.7=h7644b7e_2
- aws-c-sdkutils=0.1.15=hd34e5fa_3
- aws-checksums=0.1.18=hd34e5fa_3
- aws-crt-cpp=0.26.8=h7541583_2
- aws-sdk-cpp=1.11.267=h18943f6_7
- bokeh=3.4.1=pyhd8ed1ab_0
- botocore=1.34.51=pyge310_1234567_0
- brotli-python=1.1.0=py311ha891d26_1
- bzip2=1.0.8=h93a5062_5
- c-ares=1.28.1=h93a5062_0
- ca-certificates=2024.2.2=hf0a4a13_0
- click=8.1.7=unix_pyh707e725_0
- cloudpickle=3.0.0=pyhd8ed1ab_0
- condastats=0.2.1=pyhd8ed1ab_0
- contourpy=1.2.1=py311hcc98501_0
- cramjam=2.8.3=py311h94f323b_0
- cytoolz=0.12.3=py311h05b510d_0
- dask=2024.2.1=pyhd8ed1ab_0
- dask-core=2024.2.1=pyhd8ed1ab_1
- distributed=2024.2.1=pyhd8ed1ab_0
- freetype=2.12.1=hadb7bae_2
- frozenlist=1.4.1=py311h05b510d_0
- fsspec=2024.3.1=pyhca7485f_0
- gflags=2.2.2=hc88da5d_1004
- glog=0.7.0=hc6770e3_0
- icu=73.2=hc8870d7_0
- idna=3.7=pyhd8ed1ab_0
- importlib-metadata=7.1.0=pyha770c72_0
- importlib_metadata=7.1.0=hd8ed1ab_0
- jinja2=3.1.3=pyhd8ed1ab_0
- jmespath=1.0.1=pyhd8ed1ab_0
- krb5=1.21.2=h92f50d5_0
- lcms2=2.16=ha0e7c42_0
- lerc=4.0.0=h9a09cb3_0
- libabseil=20240116.2=cxx17_hebf3989_0
- libarrow=15.0.2=hea125af_6_cpu
- libarrow-acero=15.0.2=h3f3aa29_6_cpu
- libarrow-dataset=15.0.2=h3f3aa29_6_cpu
- libarrow-flight=15.0.2=h224147a_6_cpu
- libarrow-flight-sql=15.0.2=hb630850_6_cpu
- libarrow-gandiva=15.0.2=h3b9069c_6_cpu
- libarrow-substrait=15.0.2=hd92e347_6_cpu
- libblas=3.9.0=22_osxarm64_openblas
- libbrotlicommon=1.1.0=hb547adb_1
- libbrotlidec=1.1.0=hb547adb_1
- libbrotlienc=1.1.0=hb547adb_1
- libcblas=3.9.0=22_osxarm64_openblas
- libcrc32c=1.1.2=hbdafb3b_0
- libcurl=8.7.1=h2d989ff_0
- libcxx=16.0.6=h4653b0c_0
- libdeflate=1.20=h93a5062_0
- libedit=3.1.20191231=hc8eb9b7_2
- libev=4.33=h93a5062_2
- libevent=2.1.12=h2757513_1
- libexpat=2.6.2=hebf3989_0
- libffi=3.4.2=h3422bc3_5
- libgfortran=5.0.0=13_2_0_hd922786_3
- libgfortran5=13.2.0=hf226fd6_3
- libgoogle-cloud=2.23.0=hbebe991_1
- libgoogle-cloud-storage=2.23.0=h8a76758_1
- libgrpc=1.62.2=h9c18a4f_0
- libiconv=1.17=h0d3ecfb_2
- libjpeg-turbo=3.0.0=hb547adb_1
- liblapack=3.9.0=22_osxarm64_openblas
- libllvm16=16.0.6=haab561b_3
- libnghttp2=1.58.0=ha4dd798_1
- libopenblas=0.3.27=openmp_h6c19121_0
- libparquet=15.0.2=h5304c63_6_cpu
- libpng=1.6.43=h091b4b1_0
- libprotobuf=4.25.3=hbfab5d5_0
- libre2-11=2023.09.01=h7b2c953_2
- libsqlite=3.45.3=h091b4b1_0
- libssh2=1.11.0=h7a5bd25_0
- libthrift=0.19.0=h026a170_1
- libtiff=4.6.0=h07db509_3
- libutf8proc=2.8.0=h1a8c8d9_0
- libwebp-base=1.4.0=h93a5062_0
- libxcb=1.15=hf346824_0
- libxml2=2.12.6=h0d0cfa8_2
- libzlib=1.2.13=h53f4e23_5
- llvm-openmp=18.1.5=hde57baf_0
- locket=1.0.0=pyhd8ed1ab_0
- lz4=4.3.3=py311hd44b8e9_0
- lz4-c=1.9.4=hb7217d7_0
- markupsafe=2.1.5=py311h05b510d_0
- msgpack-python=1.0.7=py311hd03642b_0
- multidict=6.0.5=py311he2be06e_0
- ncurses=6.4.20240210=h078ce10_0
- numpy=1.26.4=py311h7125741_0
- openjpeg=2.5.2=h9f1df11_0
- openssl=3.3.0=h0d3ecfb_0
- orc=2.0.0=h4aad248_1
- packaging=24.0=pyhd8ed1ab_0
- pandas=1.5.3=py311h4eec4a9_1
- partd=1.4.1=pyhd8ed1ab_0
- pillow=10.3.0=py311h0b5d0a1_0
- pip=24.0=pyhd8ed1ab_0
- psutil=5.9.8=py311h05b510d_0
- pthread-stubs=0.4=h27ca646_1001
- pyarrow=15.0.2=py311h5ff715f_6_cpu
- pyarrow-hotfix=0.6=pyhd8ed1ab_0
- pysocks=1.7.1=pyha2e5f31_6
- python=3.11.9=h932a869_0_cpython
- python-dateutil=2.9.0=pyhd8ed1ab_0
- python-snappy=0.7.1=pyh48db8ab_1
- python-tzdata=2024.1=pyhd8ed1ab_0
- python_abi=3.11=4_cp311
- pytz=2024.1=pyhd8ed1ab_0
- pyyaml=6.0.1=py311heffc1b2_1
- re2=2023.09.01=h4cba328_2
- readline=8.2=h92ec313_1
- s3fs=2024.3.1=pyhd8ed1ab_0
- setuptools=69.5.1=pyhd8ed1ab_0
- six=1.16.0=pyh6c4a22f_0
- snappy=1.2.0=hd04f947_1
- sortedcontainers=2.4.0=pyhd8ed1ab_0
- tblib=3.0.0=pyhd8ed1ab_0
- tk=8.6.13=h5083fa2_1
- toolz=0.12.1=pyhd8ed1ab_0
- tornado=6.4=py311h05b510d_0
- typing_extensions=4.11.0=pyha770c72_0
- tzdata=2024a=h0c530f3_0
- urllib3=2.0.7=pyhd8ed1ab_0
- wheel=0.43.0=pyhd8ed1ab_1
- wrapt=1.16.0=py311h05b510d_0
- xorg-libxau=1.0.11=hb547adb_0
- xorg-libxdmcp=1.1.3=h27ca646_0
- xyzservices=2024.4.0=pyhd8ed1ab_0
- xz=5.2.6=h57fd34a_0
- yaml=0.2.5=h3422bc3_2
- yarl=1.9.4=py311h05b510d_0
- zict=3.0.0=pyhd8ed1ab_0
- zipp=3.17.0=pyhd8ed1ab_0
- zstd=1.5.6=hb46c0d2_0
Tried to query statistics on a recent pandas
release and got a traceback. Originally was using Pandas 2. So downgraded to Pandas 1 in case that was the issue, but the error still came up
condastats overall pandas --start_month 2024-03 --end_month 2024-04 --monthly
Traceback (most recent call last):
File "/Users/jkirkham/miniforge/envs/condastats/bin/condastats", line 10, in <module>
sys.exit(main())
^^^^^^
File "/Users/jkirkham/miniforge/envs/condastats/lib/python3.11/site-packages/condastats/cli.py", line 387, in main
overall(
File "/Users/jkirkham/miniforge/envs/condastats/lib/python3.11/site-packages/condastats/cli.py", line 88, in overall
df["pkg_name"] = df["pkg_name"].cat.remove_unused_categories()
^^^^^^^^^^^^^^^^^^
File "/Users/jkirkham/miniforge/envs/condastats/lib/python3.11/site-packages/pandas/core/generic.py", line 5902, in __getattr__
return object.__getattribute__(self, name)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/jkirkham/miniforge/envs/condastats/lib/python3.11/site-packages/pandas/core/accessor.py", line 182, in __get__
accessor_obj = self._accessor(obj)
^^^^^^^^^^^^^^^^^^^
File "/Users/jkirkham/miniforge/envs/condastats/lib/python3.11/site-packages/pandas/core/arrays/categorical.py", line 2849, in __init__
self._validate(data)
File "/Users/jkirkham/miniforge/envs/condastats/lib/python3.11/site-packages/pandas/core/arrays/categorical.py", line 2858, in _validate
raise AttributeError("Can only use .cat accessor with a 'category' dtype")
AttributeError: Can only use .cat accessor with a 'category' dtype. Did you mean: 'at'?
Note: There was new data uploaded recently ( ContinuumIO/anaconda-package-data#45 ). So it is possible this comes up due to the data structure itself
Getting the total/monthly download number from condastats
for a package (xeofs
) differs from the number you find on the corresponding anaconda website
condastats overall xeofs
The above command results in 1191, whereas the anaconda website states 4091. Where does the difference come from?
btw: Thanks for this super cool tool! :)
When making a request, I have the following PermissionError.
Traceback (most recent call last):
File "/home/.../venv/lib/python3.10/site-packages/s3fs/core.py", line 110, in _error_wrapper
return await func(*args, **kwargs)
File "/home/.../venv/lib/python3.10/site-packages/aiobotocore/client.py", line 265, in _make_api_call
raise error_class(parsed_response, operation_name)
botocore.exceptions.ClientError: An error occurred (AccessDenied) when calling the GetObject operation: Access Denied
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/home/.../venv/lib/python3.10/site-packages/condastats/cli.py", line 85, in overall
df = df.compute()
File "/home/.../venv/lib/python3.10/site-packages/dask/base.py", line 312, in compute
(result,) = compute(self, traverse=False, **kwargs)
File "/home/.../venv/lib/python3.10/site-packages/dask/base.py", line 600, in compute
results = schedule(dsk, keys, **kwargs)
File "/home/.../venv/lib/python3.10/site-packages/dask/threaded.py", line 81, in get
results = get_async(
File "/home/.../venv/lib/python3.10/site-packages/dask/local.py", line 508, in get_async
raise_exception(exc, tb)
File "/home/.../venv/lib/python3.10/site-packages/dask/local.py", line 316, in reraise
raise exc
File "/home/.../venv/lib/python3.10/site-packages/dask/local.py", line 221, in execute_task
result = _execute_task(task, data)
File "/home/.../venv/lib/python3.10/site-packages/dask/core.py", line 119, in _execute_task
return func(*(_execute_task(a, cache) for a in args))
File "/home/.../venv/lib/python3.10/site-packages/dask/optimization.py", line 990, in __call__
return core.get(self.dsk, self.outkey, dict(zip(self.inkeys, args)))
File "/home/.../venv/lib/python3.10/site-packages/dask/core.py", line 149, in get
result = _execute_task(task, cache)
File "/home/.../venv/lib/python3.10/site-packages/dask/core.py", line 119, in _execute_task
return func(*(_execute_task(a, cache) for a in args))
File "/home/.../venv/lib/python3.10/site-packages/dask/core.py", line 119, in <genexpr>
return func(*(_execute_task(a, cache) for a in args))
File "/home/.../venv/lib/python3.10/site-packages/dask/core.py", line 113, in _execute_task
return [_execute_task(a, cache) for a in arg]
File "/home/.../venv/lib/python3.10/site-packages/dask/core.py", line 113, in <listcomp>
return [_execute_task(a, cache) for a in arg]
File "/home/.../venv/lib/python3.10/site-packages/dask/core.py", line 119, in _execute_task
return func(*(_execute_task(a, cache) for a in args))
File "/home/.../venv/lib/python3.10/site-packages/dask/dataframe/io/parquet/core.py", line 87, in __call__
return read_parquet_part(
File "/home/.../venv/lib/python3.10/site-packages/dask/dataframe/io/parquet/core.py", line 565, in read_parquet_part
dfs = [
File "/home/.../venv/lib/python3.10/site-packages/dask/dataframe/io/parquet/core.py", line 566, in <listcomp>
func(fs, rg, columns.copy(), index, **toolz.merge(kwargs, kw))
File "/home/.../venv/lib/python3.10/site-packages/dask/dataframe/io/parquet/fastparquet.py", line 930, in read_partition
parquet_file = ParquetFile(
File "/home/.../venv/lib/python3.10/site-packages/fastparquet/api.py", line 120, in __init__
basepath, fmd = metadata_from_many(fn, verify_schema=verify,
File "/home/.../venv/lib/python3.10/site-packages/fastparquet/util.py", line 181, in metadata_from_many
pfs = [api.ParquetFile(fn, open_with=open_with) for fn in file_list]
File "/home/.../venv/lib/python3.10/site-packages/fastparquet/util.py", line 181, in <listcomp>
pfs = [api.ParquetFile(fn, open_with=open_with) for fn in file_list]
File "/home/.../venv/lib/python3.10/site-packages/fastparquet/api.py", line 139, in __init__
self._parse_header(f, verify)
File "/home/.../venv/lib/python3.10/site-packages/fastparquet/api.py", line 200, in _parse_header
head_size = struct.unpack('<I', f.read(4))[0]
File "/home/.../venv/lib/python3.10/site-packages/fsspec/spec.py", line 1578, in read
out = self.cache._fetch(self.loc, self.loc + length)
File "/home/.../venv/lib/python3.10/site-packages/fsspec/caching.py", line 377, in _fetch
self.cache = self.fetcher(start, bend)
File "/home/.../venv/lib/python3.10/site-packages/s3fs/core.py", line 2030, in _fetch_range
return _fetch_range(
File "/home/.../venv/lib/python3.10/site-packages/s3fs/core.py", line 2173, in _fetch_range
resp = fs.call_s3(
File "/home/.../venv/lib/python3.10/site-packages/fsspec/asyn.py", line 86, in wrapper
return sync(self.loop, func, *args, **kwargs)
File "/home/.../venv/lib/python3.10/site-packages/fsspec/asyn.py", line 66, in sync
raise return_result
File "/home/.../venv/lib/python3.10/site-packages/fsspec/asyn.py", line 26, in _runner
result[0] = await coro
File "/home/.../venv/lib/python3.10/site-packages/s3fs/core.py", line 332, in _call_s3
return await _error_wrapper(
File "/home/.../venv/lib/python3.10/site-packages/s3fs/core.py", line 137, in _error_wrapper
raise err
PermissionError: Access Denied
Any idea? Thank you in advance!
import condastats.cli
condastats.cli.overall('numpy')
Unable to use condastats.cli.overall (internal error on pandas->pyArrow)
dataconda = condastats.cli.overall([conda_module], monthly=True)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "[...]/lib/python3.11/site-packages/condastats/cli.py", line 62, in overall
df = dd.read_parquet(
^^^^^^^^^^^^^^^^
File "[...]/python3.11/site-packages/dask/backends.py", line 138, in wrapper
raise type(e)(
ValueError: An error occurred while calling the read_parquet method registered to the pandas backend.
Original Message: ArrowStringArray requires a PyArrow (chunked) array of string type
I've successfully used the condastats cli to get download data from common packages like pandas, but I can't seem to figure out how to download stats from a personal organization / channel. I'm trying to get stats for the conda package: https://anaconda.org/NREL/nrel-rev
These commands work successfully for me:
condastats overall pandas
condastats overall --data_source anaconda package pandas
But these commands all return an empty series:
condastats overall nrel-rev
condastats overall --data_source nrel package nrel-rev
condastats overall --data_source NREL package nrel-rev
Is there some secret incantation I'm missing?
I am opening a new issue for an old problem, as I cannot reopen the previous one.
Cheers,
Tom.
pkg_version uses all memory on system then kills Python
(base) belt@orion:~$ python3
Python 3.7.10 | packaged by conda-forge | (default, Feb 19 2021, 16:07:37)
[GCC 9.3.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> from condastats.cli import overall, pkg_platform, pkg_version
>>> print(pkg_version("cutensor", monthly=True))
Killed
@sophiamyang Have you noticed that condastats isn't returning results after 05/2021?
I haven't had a chance to dig into yet.
(base) belt@orion:~/workStuff/nvdashboard/github_stats$ condastats overall --monthly cutensor
pkg_name time
cutensor 2021-01 1361
2021-02 4807
2021-03 10328
2021-04 8629
2021-05 12540
Name: counts, dtype: int64
(base) belt@orion:~/workStuff/nvdashboard/github_stats$ condastats overall --monthly pandas
pkg_name time
pandas 2017-01 253587
2017-02 287581
2017-03 305028
2017-04 374032
2017-05 421278
2017-06 386988
2017-07 416581
2017-08 472880
2017-09 419568
2017-10 497473
2017-11 491981
2017-12 502551
2018-01 638336
2018-02 563629
2018-03 552127
2018-04 460560
2018-05 585869
2018-06 634674
2018-07 743707
2018-08 832364
2018-09 763980
2018-10 905153
2018-11 977088
2018-12 822035
2019-01 932443
2019-02 1049595
2019-03 1268802
2019-04 1097222
2019-05 1333994
2019-06 1161773
2019-07 1023263
2019-08 1065872
2019-09 858497
2019-10 985868
2019-11 1076141
2019-12 1154434
2020-01 1386204
2020-02 1417490
2020-03 1576611
2020-04 1669577
2020-05 1627911
2020-06 1655576
2020-07 1587644
2020-08 1723479
2020-09 1785201
2020-10 1893043
2020-11 1644537
2020-12 1663101
2021-01 2151716
2021-02 2573241
2021-03 2835257
2021-04 2747648
2021-05 2391608
Name: counts, dtype: int64
The data for March and April of this year, 2023, seems to be missing:
$ condastats overall cudf --start_month 2023-01 --data_source rapidsai --monthly
pkg_name time
cudf 2018-10 93
2018-11 1038
2018-12 807
2019-01 1503
2019-02 2702
2019-03 4586
2019-04 5245
2019-05 4576
2019-06 3470
2019-07 5546
2019-08 2825
2019-09 2244
2019-10 3330
2019-11 2669
2019-12 2129
2020-01 2329
2020-02 5502
2020-03 6005
2020-04 8149
2020-05 9698
2020-06 9637
2020-07 10062
2020-08 11617
2020-09 14342
2020-10 12957
2020-11 11489
2020-12 11205
2021-01 11267
2021-02 11824
2021-03 17856
2021-04 18740
2021-05 20166
2021-06 20394
2021-07 16088
2021-08 14745
2021-09 15327
2021-10 16039
2021-11 13019
2021-12 12711
2022-01 15999
2022-02 17021
2022-03 20025
2022-04 19299
2022-05 19367
2022-06 23601
2022-07 25680
2022-08 20749
2022-09 18496
2022-10 18093
2022-11 18324
2022-12 19657
2023-01 12923
2023-02 17163
Running
>>> import condastats.cli
>>> condastats.cli.overall('numpy')
I received
RuntimeError: Assigned array dtype (int16) cannot accommodate number of category labels (32833)
>>> import condastats.cli
>>> condastats.cli.overall('numpy')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/home/zaikunzhang/.local/lib/python3.9/site-packages/condastats/cli.py", line 85, in overall
df = df.compute()
File "/home/zaikunzhang/.local/lib/python3.9/site-packages/dask/base.py", line 312, in compute
(result,) = compute(self, traverse=False, **kwargs)
File "/home/zaikunzhang/.local/lib/python3.9/site-packages/dask/base.py", line 600, in compute
results = schedule(dsk, keys, **kwargs)
File "/home/zaikunzhang/.local/lib/python3.9/site-packages/dask/threaded.py", line 81, in get
results = get_async(
File "/home/zaikunzhang/.local/lib/python3.9/site-packages/dask/local.py", line 508, in get_async
raise_exception(exc, tb)
File "/home/zaikunzhang/.local/lib/python3.9/site-packages/dask/local.py", line 316, in reraise
raise exc
File "/home/zaikunzhang/.local/lib/python3.9/site-packages/dask/local.py", line 221, in execute_task
result = _execute_task(task, data)
File "/home/zaikunzhang/.local/lib/python3.9/site-packages/dask/core.py", line 119, in _execute_task
return func(*(_execute_task(a, cache) for a in args))
File "/home/zaikunzhang/.local/lib/python3.9/site-packages/dask/optimization.py", line 990, in __call__
return core.get(self.dsk, self.outkey, dict(zip(self.inkeys, args)))
File "/home/zaikunzhang/.local/lib/python3.9/site-packages/dask/core.py", line 149, in get
result = _execute_task(task, cache)
File "/home/zaikunzhang/.local/lib/python3.9/site-packages/dask/core.py", line 119, in _execute_task
return func(*(_execute_task(a, cache) for a in args))
File "/home/zaikunzhang/.local/lib/python3.9/site-packages/dask/core.py", line 119, in <genexpr>
return func(*(_execute_task(a, cache) for a in args))
File "/home/zaikunzhang/.local/lib/python3.9/site-packages/dask/core.py", line 113, in _execute_task
return [_execute_task(a, cache) for a in arg]
File "/home/zaikunzhang/.local/lib/python3.9/site-packages/dask/core.py", line 113, in <listcomp>
return [_execute_task(a, cache) for a in arg]
File "/home/zaikunzhang/.local/lib/python3.9/site-packages/dask/core.py", line 119, in _execute_task
return func(*(_execute_task(a, cache) for a in args))
File "/home/zaikunzhang/.local/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py", line 87, in __call__
return read_parquet_part(
File "/home/zaikunzhang/.local/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py", line 565, in read_parquet_part
dfs = [
File "/home/zaikunzhang/.local/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py", line 566, in <listcomp>
func(fs, rg, columns.copy(), index, **toolz.merge(kwargs, kw))
File "/home/zaikunzhang/.local/lib/python3.9/site-packages/dask/dataframe/io/parquet/fastparquet.py", line 1001, in read_partition
return cls.pf_to_pandas(
File "/home/zaikunzhang/.local/lib/python3.9/site-packages/dask/dataframe/io/parquet/fastparquet.py", line 1093, in pf_to_pandas
pf.read_row_group_file(
File "/home/zaikunzhang/.local/lib/python3.9/site-packages/fastparquet/api.py", line 361, in read_row_group_file
core.read_row_group(
File "/home/zaikunzhang/.local/lib/python3.9/site-packages/fastparquet/core.py", line 608, in read_row_group
read_row_group_arrays(file, rg, columns, categories, schema_helper,
File "/home/zaikunzhang/.local/lib/python3.9/site-packages/fastparquet/core.py", line 580, in read_row_group_arrays
read_col(column, schema_helper, file, use_cat=name+'-catdef' in out,
File "/home/zaikunzhang/.local/lib/python3.9/site-packages/fastparquet/core.py", line 476, in read_col
raise RuntimeError('Assigned array dtype (%s) cannot accommodate '
RuntimeError: Assigned array dtype (int16) cannot accommodate number of category labels (32833)
Thank you very much!
A declarative, efficient, and flexible JavaScript library for building user interfaces.
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
An Open Source Machine Learning Framework for Everyone
The Web framework for perfectionists with deadlines.
A PHP framework for web artisans
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
Some thing interesting about web. New door for the world.
A server is a program made to process requests and deliver data to clients.
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
Some thing interesting about visualization, use data art
Some thing interesting about game, make everyone happy.
We are working to build community through open source technology. NB: members must have two-factor auth.
Open source projects and samples from Microsoft.
Google ❤️ Open Source for everyone.
Alibaba Open Source for everyone
Data-Driven Documents codes.
China tencent open source team.