A Coding Tutorial on Datashader on Rendering Massive Datasets with High-Performance Python Visual Analytics

import subprocess
import sys

# Packages installed before the tutorial's own imports run.  Using the
# current interpreter (`sys.executable -m pip`) makes sure they land in the
# environment that is actually executing this script.
_REQUIRED_PACKAGES = ["datashader", "colorcet", "numba", "scipy"]
_pip_command = [sys.executable, "-m", "pip", "install", "-q"]
subprocess.check_call(_pip_command + _REQUIRED_PACKAGES)


import numpy as np
import pandas as pd
import datashader as ds
import datashader.transfer_functions as tf
from datashader import reductions as rd
import colorcet as cc
import matplotlib.pyplot as plt
# The Matplotlib module is spelled `colors`; `matplotlib.colours` does not
# exist and raised ModuleNotFoundError before anything else could run.
import matplotlib.colors as mcolors
from matplotlib.gridspec import GridSpec
from scipy.stats import multivariate_normal
import time
import warnings

# Suppress every warning for the rest of the script to keep output readable.
warnings.filterwarnings("ignore")


# Report the installed Datashader version (the value printed is
# `ds.__version__`, so the label says "version").
print("Datashader version:", ds.__version__)


def present(img, title="", ax=None, figsize=(6, 5)):
    """Draw a shaded Datashader image on a Matplotlib axis.

    Args:
        img: Object exposing ``to_pil()`` (for example the result of
            ``tf.shade``); the returned image is handed to ``imshow``.
        title: Text shown above the image.
        ax: Axis to draw on.  When ``None`` a new figure is created and
            displayed immediately (standalone mode).
        figsize: Size of the figure created in standalone mode; ignored
            when *ax* is supplied.
    """
    standalone = ax is None
    if standalone:
        fig, ax = plt.subplots(figsize=figsize)
    rgba = img.to_pil()
    # `origin` accepts only "upper"/"lower", and the scaling keyword is
    # `aspect`; the previous values ("higher", `side=`) were rejected by
    # Matplotlib.
    ax.imshow(rgba, origin="upper", aspect="auto")
    # "bold" is the valid font weight name ("daring" is not).
    ax.set_title(title, fontsize=11, fontweight="bold")
    ax.axis("off")
    if standalone:
        # Only finalise and display the figure when this function owns it;
        # callers that pass `ax` lay out and show their own figure.
        plt.tight_layout()
        plt.show()


# The leading "\n" puts a blank line before the banner (the backslash had
# been lost, printing a literal "n").
print("\n=== SECTION 1: Core Pipeline ===")


# Seeded generator so the synthetic data is reproducible between runs.
rng = np.random.default_rng(42)
N   = 2_000_000


# Three Gaussian blobs of N//3 points each: two tight blobs centred at
# (-1, -1) and (1, 1), plus one centred at the origin that is wider in x
# (std 1.5) than in y (std 0.5).  Because of the integer division the frame
# holds 3 * (N // 3) rows, which is slightly fewer than N.
x = np.concatenate([rng.normal(-1, 0.5, N // 3),
                    rng.normal( 1, 0.5, N // 3),
                    rng.normal( 0, 1.5, N // 3)])
y = np.concatenate([rng.normal(-1, 0.5, N // 3),
                    rng.normal( 1, 0.5, N // 3),
                    rng.normal( 0, 0.5, N // 3)])
df_base = pd.DataFrame({"x": x, "y": y})


# Output raster: 600x500 pixels covering [-4, 4] on both axes.
canvas = ds.Canvas(plot_width=600, plot_height=500,
                   x_range=(-4, 4), y_range=(-4, 4))


# Aggregate the points into per-pixel counts.  The Canvas method is
# `points` and the reduction is `count()`.
agg = canvas.points(df_base, "x", "y", agg=rd.count())


# Shade the same aggregate with three normalisations for comparison.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
combos = [
    ("Linear / blues",  tf.shade(agg, cmap=cc.blues, how="linear")),
    ("Log    / fire",   tf.shade(agg, cmap=cc.fire,  how="log")),
    ("Eq-hist / bmy",   tf.shade(agg, cmap=cc.bmy,   how="eq_hist")),
]
for ax, (title, img) in zip(axes, combos):
    present(img, title, ax=ax)
plt.suptitle("Part 1 – 2 M points: Linear vs Log vs Eq-Hist normalisation",
             fontsize=13, fontweight="bold")
plt.tight_layout()
plt.show()


# The leading "\n" puts a blank line before the banner (the backslash had
# been lost, printing a literal "n").
print("\n=== SECTION 2: Reduction Types ===")


# Attach a numeric column and a categorical column to the existing frame so
# the value-based and category-based reductions have something to work on.
# NumPy's Generator methods take `size=`, and pandas.Categorical takes
# `categories=`.
n_actual = len(df_base)
df_base["value"] = rng.exponential(scale=2, size=n_actual)
df_base["label"] = pd.Categorical(
    rng.choice(["A", "B", "C"], size=n_actual),
    categories=["A", "B", "C"],
)


canvas2 = ds.Canvas(plot_width=400, plot_height=350,
                    x_range=(-4, 4), y_range=(-4, 4))


# (panel title, reduction, colormap).  A colormap of None marks the
# categorical reduction, which is shaded with a color key instead.
reductions_cfg = [
    ("count()",          rd.count(),            cc.kbc),
    ("sum(value)",       rd.sum("value"),       cc.CET_L3),
    ("mean(value)",      rd.mean("value"),      cc.CET_D4),
    ("std(value)",       rd.std("value"),       cc.CET_L16),
    ("min(value)",       rd.min("value"),       cc.CET_L17),
    ("max(value)",       rd.max("value"),       cc.bgyw),
    ("var(value)",       rd.var("value"),       cc.CET_L18),
    ("count_cat(label)", rd.count_cat("label"), None),
]


# 2x4 grid: one panel per reduction, iterated in row-major order.
fig, axes = plt.subplots(2, 4, figsize=(18, 9))
axes = axes.flat


for ax, (name, agg_fn, cmap) in zip(axes, reductions_cfg):
    agg_r = canvas2.points(df_base, "x", "y", agg=agg_fn)
    if cmap is None:
        img = tf.shade(agg_r, color_key={"A": "#e41a1c", "B": "#377eb8", "C": "#4daf4a"})
    else:
        img = tf.shade(agg_r, cmap=cmap, how="eq_hist")
    present(img, name, ax=ax)


plt.suptitle("Part 2 – All Reduction Types on 2 M points", fontsize=14, fontweight="bold")
plt.tight_layout()
plt.show()


# The leading "\n" puts a blank line before the banner (the backslash had
# been lost, printing a literal "n").
print("\n=== SECTION 3: Categorical Visualisation ===")


N_cat = 500_000
classes = ["Cluster A", "Cluster B", "Cluster C", "Cluster D"]
# Cluster centres, paired with `classes` by position.
facilities = [(-2, -2), (-2, 2), (2, -2), (2, 2)]
colours  = {"Cluster A": "#e41a1c", "Cluster B": "#377eb8",
            "Cluster C": "#4daf4a", "Cluster D": "#ff7f00"}


# One frame per cluster: Gaussian scatter (std 0.8) around its centre, with
# every frame sharing the same category list so the concatenated column
# keeps a single categorical dtype.
frames = []
for cat, (cx, cy) in zip(classes, facilities):
    n = N_cat // len(classes)
    frames.append(pd.DataFrame({
        "x":    rng.normal(cx, 0.8, n),
        "y":    rng.normal(cy, 0.8, n),
        "cat":  pd.Categorical([cat] * n, categories=classes),
    }))
df_cat = pd.concat(frames, ignore_index=True)


canvas3 = ds.Canvas(plot_width=500, plot_height=500,
                    x_range=(-5, 5), y_range=(-5, 5))
# Per-pixel counts broken down by category.
agg_cat = canvas3.points(df_cat, "x", "y", agg=rd.count_cat("cat"))


fig, axes = plt.subplots(1, 3, figsize=(16, 5))


# Panel 1: plain categorical shading.
img_raw  = tf.shade(agg_cat, color_key=colours)
present(img_raw, "Raw (no spread)", ax=axes[0])


# Panel 2: the same shading with `tf.spread` applied at px=1.
img_sp1  = tf.spread(tf.shade(agg_cat, color_key=colours), px=1)
present(img_sp1, "Spread px=1", ax=axes[1])


# Panel 3: the same shading on a black background (keyword is `color`).
img_bg   = tf.set_background(tf.shade(agg_cat, color_key=colours), color="black")
present(img_bg, "Black background", ax=axes[2])


# Empty plots exist only to create legend entries in the category colours.
for cat, col in colours.items():
    axes[2].plot([], [], "o", color=col, label=cat, markersize=8)
axes[2].legend(loc="lower right", fontsize=8, framealpha=0.6)


plt.suptitle("Part 3 – Categorical Rendering (500 k points)", fontsize=13, fontweight="bold")
plt.tight_layout()
plt.show()

Source link

A Coding Tutorial on Datashader on Rendering Massive Datasets with High-Performance Python Visual Analytics

OpenAI says hackers stole some information after newest code safety concern

Cerebras raises $5.5B, kicking off 2026’s IPO season with a bang

Khosla Ventures is betting $10M on Ian Crosby, whose final startup, Bench, imploded

A Coding Tutorial on Datashader on Rendering Massive Datasets with High-Performance Python Visual Analytics

Related Posts

OpenAI says hackers stole some information after newest code safety concern

Cerebras raises $5.5B, kicking off 2026’s IPO season with a bang

Khosla Ventures is betting $10M on Ian Crosby, whose final startup, Bench, imploded