统计绘图#

直方图#

使用 quad() 字形从 np.histogram 输出创建直方图

import numpy as np

from bokeh.plotting import figure, show

rng = np.random.default_rng()
x = rng.normal(loc=0, scale=1, size=1000)

p = figure(width=670, height=400, toolbar_location=None,
           title="Normal (Gaussian) Distribution")

# Histogram
bins = np.linspace(-3, 3, 40)
hist, edges = np.histogram(x, density=True, bins=bins)
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
         fill_color="skyblue", line_color="white",
         legend_label="1000 random samples")

# Probability density function
x = np.linspace(-3.0, 3.0, 100)
pdf = np.exp(-0.5*x**2) / np.sqrt(2.0*np.pi)
p.line(x, pdf, line_width=2, line_color="navy",
       legend_label="Probability Density Function")

p.y_range.start = 0
p.xaxis.axis_label = "x"
p.yaxis.axis_label = "PDF(x)"

show(p)

人口金字塔图是一种发散的水平条形图,可用于比较两组之间的分布。在 Bokeh 中,可以使用 hbar() 字形创建它们。

import numpy as np

from bokeh.models import CustomJSTickFormatter, Label
from bokeh.palettes import DarkText, Vibrant3 as colors
from bokeh.plotting import figure, show
from bokeh.sampledata.titanic import data as df

sex_group = df.groupby("sex")

female_ages = sex_group.get_group("female")["age"].dropna()
male_ages = sex_group.get_group("male")["age"].dropna()

bin_width = 5
bins = np.arange(0, 72, bin_width)
m_hist, edges = np.histogram(male_ages, bins=bins)
f_hist, edges = np.histogram(female_ages, bins=bins)

p = figure(title="Age population pyramid of titanic passengers, by gender", height=400, width=600,
           x_range=(-90, 90), x_axis_label="count")

p.hbar(right=f_hist, y=edges[1:], height=bin_width*0.8, color=colors[0], line_width=0)

p.hbar(right=m_hist * -1, y=edges[1:], height=bin_width*0.8, color=colors[1], line_width=0)

# add text to every other bar
for i, (count, age) in enumerate(zip(f_hist, edges[1:])):
    if i % 2 == 1:
        continue
    p.text(x=count, y=edges[1:][i], text=[f"{age-bin_width}-{age}yrs"],
           x_offset=5, y_offset=7, text_font_size="12px", text_color=DarkText[5])

# customise x-axis and y-axis
p.xaxis.ticker = (-80, -60, -40, -20, 0, 20, 40, 60, 80)
p.xaxis.major_tick_out = 0
p.y_range.start = 3
p.ygrid.grid_line_color = None
p.yaxis.visible = False

# format tick labels as absolute values for the two-sided plot
p.xaxis.formatter = CustomJSTickFormatter(code="return Math.abs(tick);")

# add labels
p.add_layout(Label(x=-40, y=70, text="Men", text_color=colors[1], x_offset=5))
p.add_layout(Label(x=20, y=70, text="Women", text_color=colors[0], x_offset=5))

show(p)

箱线图#

箱线图可以使用 Whisker 注释、vbar()scatter() 字形组装

import pandas as pd

from bokeh.models import ColumnDataSource, Whisker
from bokeh.plotting import figure, show
from bokeh.sampledata.autompg2 import autompg2
from bokeh.transform import factor_cmap

df = autompg2[["class", "hwy"]].rename(columns={"class": "kind"})

kinds = df.kind.unique()

# compute quantiles
grouper = df.groupby("kind")
qs = grouper.hwy.quantile([0.25, 0.5, 0.75]).unstack().reset_index()
qs.columns = ["kind", "q1", "q2", "q3"]

# compute IQR outlier bounds
iqr = qs.q3 - qs.q1
qs["upper"] = qs.q3 + 1.5*iqr
qs["lower"] = qs.q1 - 1.5*iqr

# update the whiskers to actual data points
for kind, group in grouper:
    qs_idx = qs.query(f"kind=={kind!r}").index[0]
    data = group["hwy"]

    # the upper whisker is the maximum between p3 and upper
    q3 = qs.loc[qs_idx, "q3"]
    upper = qs.loc[qs_idx, "upper"]
    wiskhi = group[(q3 <= data) & (data <= upper)]["hwy"]
    qs.loc[qs_idx, "upper"] = q3 if len(wiskhi) == 0 else wiskhi.max()

    # the lower whisker is the minimum between q1 and lower
    q1 = qs.loc[qs_idx, "q1"]
    lower = qs.loc[qs_idx, "lower"]
    wisklo = group[(lower <= data) & (data<= q1)]["hwy"]
    qs.loc[qs_idx, "lower"] = q1 if len(wisklo) == 0 else wisklo.min()

df = pd.merge(df, qs, on="kind", how="left")

source = ColumnDataSource(qs)

p = figure(x_range=kinds, tools="", toolbar_location=None,
           title="Highway MPG distribution by vehicle class",
           background_fill_color="#eaefef", y_axis_label="MPG")

# outlier range
whisker = Whisker(base="kind", upper="upper", lower="lower", source=source)
whisker.upper_head.size = whisker.lower_head.size = 20
p.add_layout(whisker)

# quantile boxes
cmap = factor_cmap("kind", "TolRainbow7", kinds)
p.vbar("kind", 0.7, "q2", "q3", source=source, color=cmap, line_color="black")
p.vbar("kind", 0.7, "q1", "q2", source=source, color=cmap, line_color="black")

# outliers
outliers = df[~df.hwy.between(df.lower, df.upper)]
p.scatter("kind", "hwy", source=outliers, size=6, color="black", alpha=0.3)

p.xgrid.grid_line_color = None
p.axis.major_label_text_font_size="14px"
p.axis.axis_label_text_font_size="12px"

show(p)

核密度估计#

import numpy as np
from scipy.stats import gaussian_kde

from bokeh.palettes import Blues9
from bokeh.plotting import figure, show
from bokeh.sampledata.autompg import autompg as df


def kde(x, y, N):
    xmin, xmax = x.min(), x.max()
    ymin, ymax = y.min(), y.max()

    X, Y = np.mgrid[xmin:xmax:N*1j, ymin:ymax:N*1j]
    positions = np.vstack([X.ravel(), Y.ravel()])
    values = np.vstack([x, y])
    kernel = gaussian_kde(values)
    Z = np.reshape(kernel(positions).T, X.shape)

    return X, Y, Z

x, y, z = kde(df.hp, df.mpg, 300)

p = figure(height=400, x_axis_label="hp", y_axis_label="mpg",
           background_fill_color="#fafafa", tools="", toolbar_location=None,
           title="Kernel density estimation plot of HP vs MPG")
p.grid.level = "overlay"
p.grid.grid_line_color = "black"
p.grid.grid_line_alpha = 0.05

palette = Blues9[::-1]
levels = np.linspace(np.min(z), np.max(z), 10)
p.contour(x, y, z, levels[1:], fill_color=palette, line_color=palette)

show(p)

核密度估计也可以使用 varea() 字形绘制

import numpy as np
from sklearn.neighbors import KernelDensity

from bokeh.models import ColumnDataSource, Label, PrintfTickFormatter
from bokeh.palettes import Dark2_5 as colors
from bokeh.plotting import figure, show
from bokeh.sampledata.cows import data as df

breed_groups = df.groupby('breed')
x = np.linspace(2, 8, 1000)
source = ColumnDataSource(dict(x=x))

p = figure(title="Multiple density estimates", height=300, x_range=(2.5, 7.5), x_axis_label="butterfat contents", y_axis_label="density")

for (breed, breed_df), color in zip(breed_groups, colors):
    data = breed_df['butterfat'].values
    kde = KernelDensity(kernel="gaussian", bandwidth=0.2).fit(data[:, np.newaxis])
    log_density = kde.score_samples(x[:, np.newaxis])
    y = np.exp(log_density)
    source.add(y, breed)
    p.varea(x="x", y1=breed, y2=0, source=source, fill_alpha=0.3, fill_color=color)

    # Find the highest point and annotate with a label
    max_idx = np.argmax(y)
    highest_point_label = Label(
        x=x[max_idx],
        y=y[max_idx],
        text=breed,
        text_font_size="10pt",
        x_offset=10,
        y_offset=-5,
        text_color=color,
    )
    p.add_layout(highest_point_label)

# Display x-axis labels as percentages
p.xaxis.formatter = PrintfTickFormatter(format="%d%%")

p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.minor_tick_line_color = None

p.xgrid.grid_line_color = None
p.yaxis.ticker = (0, 0.5, 1, 1.5)
p.y_range.start = 0

show(p)

SinaPlot#

SinaPlot 可以使用 harea()scatter() 字形组装

import numpy as np
import pandas as pd
from sklearn.neighbors import KernelDensity

from bokeh.plotting import figure, show
from bokeh.sampledata.lincoln import data as df

df["DATE"] = pd.to_datetime(df["DATE"])
df["TAVG"] = (df["TMAX"] + df["TMIN"]) / 2
df["MONTH"] = df.DATE.dt.strftime("%b")

months = list(df.MONTH.unique())

p = figure(
    height=400,
    width=600,
    x_range=months,
    x_axis_label="month",
    y_axis_label="mean temperature (F)",
)

# add a non-uniform categorical offset to a given category
def offset(category, data, scale=7):
    return list(zip([category] * len(data), scale * data))


for month in months:
    month_df = df[df.MONTH == month].dropna()
    tavg = month_df.TAVG.values
    temps = np.linspace(tavg.min(), tavg.max(), 50)

    kde = KernelDensity(kernel="gaussian", bandwidth=3).fit(tavg[:, np.newaxis])
    density = np.exp(kde.score_samples(temps[:, np.newaxis]))
    x1, x2 = offset(month, density), offset(month, -density)

    p.harea(x1=x1, x2=x2, y=temps, alpha=0.8, color="#E0E0E0")

    # pre-compute jitter in Python, this case is too complex for BokehJS
    tavg_density = np.exp(kde.score_samples(tavg[:, np.newaxis]))
    jitter = (np.random.random(len(tavg)) * 2 - 1) * tavg_density

    p.scatter(x=offset(month, jitter), y=tavg, color="black")

p.y_range.start = -10
p.yaxis.ticker = [0, 25, 50, 75]
p.grid.grid_line_color = None

show(p)

SPLOM#

SPLOM 是“散点图矩阵”,它以网格方式排列多个散点图,以突出显示维度之间的相关性。SPLOM 的关键组件是 链接平移链接刷选,如本例所示

from itertools import product

from bokeh.io import show
from bokeh.layouts import gridplot
from bokeh.models import (BasicTicker, ColumnDataSource, DataRange1d,
                          Grid, LassoSelectTool, LinearAxis, PanTool,
                          Plot, ResetTool, Scatter, WheelZoomTool)
from bokeh.sampledata.penguins import data
from bokeh.transform import factor_cmap

df = data.copy()
df["body_mass_kg"] = df["body_mass_g"] / 1000

SPECIES = sorted(df.species.unique())
ATTRS = ("bill_length_mm", "bill_depth_mm", "body_mass_kg")
N = len(ATTRS)

source = ColumnDataSource(data=df)

xdrs = [DataRange1d(bounds=None) for _ in range(N)]
ydrs = [DataRange1d(bounds=None) for _ in range(N)]

plots = []

for i, (y, x) in enumerate(product(ATTRS, reversed(ATTRS))):
    p = Plot(x_range=xdrs[i%N], y_range=ydrs[i//N],
             background_fill_color="#fafafa",
             border_fill_color="white", width=200, height=200, min_border=5)

    if i % N == 0:  # first column
        p.min_border_left = p.min_border + 4
        p.width += 40
        yaxis = LinearAxis(axis_label=y)
        yaxis.major_label_orientation = "vertical"
        p.add_layout(yaxis, "left")
        yticker = yaxis.ticker
    else:
        yticker = BasicTicker()
    p.add_layout(Grid(dimension=1, ticker=yticker))

    if i >= N*(N-1):  # last row
        p.min_border_bottom = p.min_border + 40
        p.height += 40
        xaxis = LinearAxis(axis_label=x)
        p.add_layout(xaxis, "below")
        xticker = xaxis.ticker
    else:
        xticker = BasicTicker()
    p.add_layout(Grid(dimension=0, ticker=xticker))

    scatter = Scatter(x=x, y=y, fill_alpha=0.6, size=5, line_color=None,
                      fill_color=factor_cmap('species', 'Category10_3', SPECIES))
    r = p.add_glyph(source, scatter)
    p.x_range.renderers.append(r)
    p.y_range.renderers.append(r)

    # suppress the diagonal
    if (i%N) + (i//N) == N-1:
        r.visible = False
        p.grid.grid_line_color = None

    p.add_tools(PanTool(), WheelZoomTool(), ResetTool(), LassoSelectTool())

    plots.append(p)

show(gridplot(plots, ncols=N))