Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions examples/fastparquet/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# fastparquet Examples

Each sub-directory contains a self-contained example. The order in
which the examples are to appear is specified in `order.json` (an
array of directory names in the expected order).

In each example directory you'll find:

* `config.toml` - must conform to the specification outlined at
<https://docs.pyscript.net/latest/user-guide/configuration/>. The file
is parsed and ultimately turned into a JSON representation as part of
the package's API object.
* `setup.py` - Python code for contextual and environmental setup. It
is NOT SEEN BY THE END USER, but it runs before the `code.py` code is
evaluated. This lets us create useful (IPython) shims, avoid
repeating boilerplate, and so on.
* `code.py` - the actual code added to the editor which forms the
practical example of using the package.
55 changes: 55 additions & 0 deletions examples/fastparquet/column_selection_and_categories/code.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# ---------------------------------------------------------------------
# Reading just the columns you need, and decoding low-cardinality
# string columns as pandas Categoricals for memory savings.
#
# heading(), note(), display(), rng, np, pd, plt, write and ParquetFile
# are provided by this example's setup.py, which runs before this code.
# ---------------------------------------------------------------------

heading("Selective column reads")
note(
    "Parquet's columnar layout means you can read just the columns "
    "you need without scanning the rest. We'll write a wider dataset "
    "and then load only two of its columns."
)

# A wider table: orders from a fictional online bookshop.
n_orders = 2000
genres = ["Fiction", "Non-fiction", "Poetry", "Science", "History"]
countries = ["UK", "US", "DE", "FR", "JP", "BR"]

orders = pd.DataFrame({
    "order_id": np.arange(n_orders),
    "genre": pd.Categorical(rng.choice(genres, size=n_orders), categories=genres),
    "country": rng.choice(countries, size=n_orders),
    "price": rng.uniform(5, 35, size=n_orders).round(2),
    "quantity": rng.integers(1, 6, size=n_orders),
    "discount": rng.uniform(0, 0.4, size=n_orders).round(3),
    "shipping": rng.uniform(0, 8, size=n_orders).round(2),
})

write("/tmp/orders.parq", orders, compression="SNAPPY")
parquet_file = ParquetFile("/tmp/orders.parq")

note(f"All columns in the file: <code>{parquet_file.columns}</code>")

# Load only two columns, and treat 'genre' as a categorical.
slim = parquet_file.to_pandas(
    columns=["genre", "price"],
    categories=["genre"],
)

note("Loaded only <code>genre</code> and <code>price</code>:")
display(slim.head(), append=True)
note(
    f"<code>genre</code> dtype is "
    f"<strong>{slim['genre'].dtype}</strong> "
    f"with categories <code>{list(slim['genre'].cat.categories)}</code>."
)

# A quick aggregate and bar chart from the slim frame.
# observed=True keeps only the genres that actually occur in the data.
avg_by_genre = slim.groupby("genre", observed=True)["price"].mean().sort_values()

fig, ax = plt.subplots(figsize=(8, 4))
avg_by_genre.plot(kind="barh", ax=ax, color="teal")
ax.set_title("Average book price by genre")
ax.set_xlabel("Average price ($)")
fig.tight_layout()
display(fig, append=True)
# display() has already been handed the figure; release it from pyplot's
# registry so repeated runs of the example don't accumulate open figures.
plt.close(fig)
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
packages = ["fastparquet", "pandas", "numpy", "matplotlib"]
27 changes: 27 additions & 0 deletions examples/fastparquet/column_selection_and_categories/setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""Setup for cell 2: same names as cell 1, without the IPython shim."""
import js
from pyscript import window, HTML, display as _display

# Make js.alert refer to the page's window.alert.
# NOTE(review): presumably so example code calling js.alert(...) works even
# where the js module has no alert of its own (e.g. in a worker) - confirm.
js.alert = window.alert


def display(*args, **kwargs):
    """Forward to ``pyscript.display``, pinned to this example's output target.

    All positional and keyword arguments are passed through unchanged; the
    ``target`` keyword is always supplied from ``__pyscript_display_target__``,
    so callers must not pass ``target`` themselves.
    """
    target = __pyscript_display_target__
    return _display(*args, target=target, **kwargs)


def heading(text, level=2):
    """Append *text* to the example output as an ``<h{level}>`` heading.

    ``text`` is interpolated as-is (no escaping), so it may contain inline
    HTML. ``level`` defaults to 2.
    """
    markup = HTML(f"<h{level}>{text}</h{level}>")
    display(markup, append=True)


def note(text):
    """Append *text* to the example output as a ``<p>`` paragraph.

    ``text`` is interpolated as-is (no escaping), so it may contain inline
    HTML such as ``<code>`` or ``<strong>``.
    """
    paragraph = HTML(f"<p>{text}</p>")
    display(paragraph, append=True)


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from fastparquet import ParquetFile, write

# Fixed seed (7) so the randomly generated example data is the same on
# every run.
rng = np.random.default_rng(7)
5 changes: 5 additions & 0 deletions examples/fastparquet/order.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[
"write_and_read",
"column_selection_and_categories",
"row_groups_and_filters"
]
71 changes: 71 additions & 0 deletions examples/fastparquet/row_groups_and_filters/code.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# ---------------------------------------------------------------------
# Splitting a file into row groups and using filters to skip groups
# whose statistics rule them out. This is the foundation of fast
# analytical reads on large Parquet datasets.
# ---------------------------------------------------------------------

heading("Row groups: chunking a file for selective reads")
note(
    "A row group is a horizontal slice of the table. Each group "
    "stores per-column min/max statistics, so a reader can skip "
    "entire groups that can't possibly match a filter."
)

# Fifty thousand sensor readings (one per minute), sorted by timestamp
# so row groups correspond to contiguous time windows.
n_readings = 50_000
sensors = pd.DataFrame({
    "timestamp": pd.date_range("2026-01-01", periods=n_readings, freq="min"),
    "sensor_id": rng.integers(0, 20, size=n_readings),
    "value": rng.normal(100, 15, size=n_readings).round(3),
}).sort_values("timestamp").reset_index(drop=True)

# row_group_offsets carves the table into chunks at the given row
# indices. Here: ten groups of 5,000 rows each (offsets 0, 5000, ...,
# 45000).
offsets = list(range(0, n_readings, 5_000))
write(
    "/tmp/sensors.parq",
    sensors,
    row_group_offsets=offsets,
    compression="SNAPPY",
)

parquet_file = ParquetFile("/tmp/sensors.parq")
note(
    f"File has <strong>{len(parquet_file.row_groups)}</strong> row groups, "
    f"<strong>{parquet_file.count()}</strong> rows total."
)

# Per-row-group time ranges from the file's statistics: one entry per
# row group for each column's min and max.
stats = parquet_file.statistics
group_ranges = pd.DataFrame({
    "min_timestamp": pd.to_datetime(stats["min"]["timestamp"]),
    "max_timestamp": pd.to_datetime(stats["max"]["timestamp"]),
    "min_value": stats["min"]["value"],
    "max_value": stats["max"]["value"],
})
note("First few row-group statistics:")
display(group_ranges.head().round(3), append=True)

# A filter is a list of (column, op, value) tuples. fastparquet uses
# row-group statistics to skip groups that can't match.
target_sensor = 7
filters = [
    ("sensor_id", "==", target_sensor),
    ("value", ">", 120.0),
]

filtered = parquet_file.to_pandas(filters=filters)
# Filters operate at row-group granularity, so apply them again
# row-wise for an exact result.
exact = filtered[
    (filtered["sensor_id"] == target_sensor)
    & (filtered["value"] > 120.0)
]

heading("Filtered result")
note(
    f"Rows where <code>sensor_id == {target_sensor}</code> and "
    f"<code>value &gt; 120</code>: <strong>{len(exact)}</strong>."
)
display(exact.head(), append=True)
1 change: 1 addition & 0 deletions examples/fastparquet/row_groups_and_filters/config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
packages = ["fastparquet", "pandas", "numpy"]
26 changes: 26 additions & 0 deletions examples/fastparquet/row_groups_and_filters/setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Setup for cell 3: same names again, no IPython shim."""
import js
from pyscript import window, HTML, display as _display

# Make js.alert refer to the page's window.alert.
# NOTE(review): presumably so example code calling js.alert(...) works even
# where the js module has no alert of its own (e.g. in a worker) - confirm.
js.alert = window.alert


def display(*args, **kwargs):
    """Forward to ``pyscript.display``, pinned to this example's output target.

    All positional and keyword arguments are passed through unchanged; the
    ``target`` keyword is always supplied from ``__pyscript_display_target__``,
    so callers must not pass ``target`` themselves.
    """
    target = __pyscript_display_target__
    return _display(*args, target=target, **kwargs)


def heading(text, level=2):
    """Append *text* to the example output as an ``<h{level}>`` heading.

    ``text`` is interpolated as-is (no escaping), so it may contain inline
    HTML. ``level`` defaults to 2.
    """
    markup = HTML(f"<h{level}>{text}</h{level}>")
    display(markup, append=True)


def note(text):
    """Append *text* to the example output as a ``<p>`` paragraph.

    ``text`` is interpolated as-is (no escaping), so it may contain inline
    HTML such as ``<code>`` or ``<strong>``.
    """
    paragraph = HTML(f"<p>{text}</p>")
    display(paragraph, append=True)


import numpy as np
import pandas as pd
from fastparquet import ParquetFile, write

# Fixed seed (7) so the randomly generated example data is the same on
# every run.
rng = np.random.default_rng(7)
57 changes: 57 additions & 0 deletions examples/fastparquet/write_and_read/code.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
"""
A first taste of fastparquet: write a DataFrame to a Parquet file
in the in-memory virtual filesystem, then read it back.

Parquet is a columnar storage format that's compact, fast, and
preserves dtypes. fastparquet is a pure-Python(+Cython) implementation
that integrates closely with pandas.

Docs: https://fastparquet.readthedocs.io
"""
# Import from the public IPython.display module: importing display from
# IPython.core.display is deprecated in IPython itself.
from IPython.display import display, HTML
import numpy as np
import pandas as pd
from fastparquet import ParquetFile, write

# Fixed seed so the synthetic data is the same on every run.
rng = np.random.default_rng(7)


# A small synthetic dataset: temperature readings from weather stations.
n_rows = 500
stations = ["Reykjavik", "Lisbon", "Cairo", "Singapore", "Wellington"]

readings = pd.DataFrame({
    "station": rng.choice(stations, size=n_rows),
    "timestamp": pd.date_range("2026-01-01", periods=n_rows, freq="h"),
    "temperature_c": rng.normal(15, 8, size=n_rows).round(2),
    "humidity_pct": rng.uniform(30, 95, size=n_rows).round(1),
})

heading("1. Original DataFrame")
note("Five hundred hourly readings from five weather stations.")
display(readings.head(), append=True)

# Write to a Parquet file. Pyodide gives us a writable in-memory
# filesystem, so we can use a regular path.
path = "/tmp/weather.parq"
write(path, readings, compression="SNAPPY")
note(f"Wrote {len(readings)} rows to <code>{path}</code> with Snappy compression.")

# Read it back. ParquetFile gives us metadata first, without loading
# the whole file into memory.
parquet_file = ParquetFile(path)

heading("2. Parquet file metadata")
note(
    f"Columns: <code>{parquet_file.columns}</code><br>"
    f"Rows: <strong>{parquet_file.count()}</strong><br>"
    f"Row groups: <strong>{len(parquet_file.row_groups)}</strong>"
)

heading("3. Round-tripped DataFrame")
roundtripped = parquet_file.to_pandas()
display(roundtripped.head(), append=True)
# Column-by-column dtype comparison, reduced to a single True/False.
note(
    f"Dtypes preserved? "
    f"<strong>{(readings.dtypes == roundtripped.dtypes).all()}</strong>"
)
1 change: 1 addition & 0 deletions examples/fastparquet/write_and_read/config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
packages = ["fastparquet", "pandas", "numpy"]
41 changes: 41 additions & 0 deletions examples/fastparquet/write_and_read/setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""
Shim IPython's display API onto PyScript so example code written in a
Jupyter/IPython idiom runs unmodified in the browser.
"""

import sys
import types
import js
from pyscript import window, HTML, display as _display

# Make js.alert refer to the page's window.alert.
# NOTE(review): presumably so example code calling js.alert(...) works even
# where the js module has no alert of its own (e.g. in a worker) - confirm.
js.alert = window.alert


def display(*args, **kwargs):
    """Send output to the example's display target via ``pyscript.display``.

    Positional and keyword arguments are forwarded untouched; ``target`` is
    always filled in from ``__pyscript_display_target__``, so callers must
    not pass ``target`` themselves.
    """
    target = __pyscript_display_target__
    return _display(*args, target=target, **kwargs)


# Build stand-in IPython modules whose display/HTML are the PyScript-backed
# display wrapper and pyscript.HTML from this file.
ipython = types.ModuleType("IPython")
core = types.ModuleType("IPython.core")
core_display = types.ModuleType("IPython.core.display")
core_display.display = display
core_display.HTML = HTML
# Wire up the attribute chain so IPython.core.display resolves by attribute
# access as well as by import.
ipython.core = core
core.display = core_display
# get_ipython() always returns None in this shim.
ipython.get_ipython = lambda: None
# IPython.display is the very same module object as IPython.core.display.
ipython.display = core_display
# Register every dotted name so both import spellings
# ("from IPython.display import ..." and
# "from IPython.core.display import ...") find the stand-ins.
sys.modules["IPython"] = ipython
sys.modules["IPython.core"] = core
sys.modules["IPython.core.display"] = core_display
sys.modules["IPython.display"] = core_display


def heading(text, level=2):
    """Append *text* to the example output as an ``<h{level}>`` heading.

    ``text`` is interpolated as-is (no escaping), so it may contain inline
    HTML. ``level`` defaults to 2.
    """
    markup = HTML(f"<h{level}>{text}</h{level}>")
    display(markup, append=True)


def note(text):
    """Append *text* to the example output as a ``<p>`` paragraph.

    ``text`` is interpolated as-is (no escaping), so it may contain inline
    HTML such as ``<code>`` or ``<strong>``.
    """
    paragraph = HTML(f"<p>{text}</p>")
    display(paragraph, append=True)