(original) (raw)
From 75830e23b631d7ad8d5d15242bb2398ba98a9f5d Mon Sep 17 00:00:00 2001 From: Raymond Hettinger raymond.hettinger@gmail.com Date: Thu, 21 Feb 2019 02:37:18 -0800 Subject: [PATCH 01/16] Add the NormalDist class to statistics --- Lib/statistics.py | 198 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 197 insertions(+), 1 deletion(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 8ecb906d869951..8acd06e42b9d41 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -76,7 +76,7 @@ """ -__all__ = [ 'StatisticsError', +__all__ = [ 'StatisticsError', 'NormalDist', 'pstdev', 'pvariance', 'stdev', 'variance', 'median', 'median_low', 'median_high', 'median_grouped', 'mean', 'mode', 'harmonic_mean', 'fmean', @@ -85,11 +85,13 @@ import collections import math import numbers +import random from fractions import Fraction from decimal import Decimal from itertools import groupby from bisect import bisect_left, bisect_right +from math import hypot, sqrt, fabs, exp, erf, tau @@ -694,3 +696,197 @@ def pstdev(data, mu=None): return var.sqrt() except AttributeError: return math.sqrt(var) + +## Normal Distribution ##################################################### + +fmean = lambda s: mean(map(float, s)) # XXX take this out when fmean() lands + +class NormalDist: + 'Normal distribution of a random variable' + # https://en.wikipedia.org/wiki/Normal\_distribution + # https://en.wikipedia.org/wiki/Variance#Properties + + __slots__ = ('mu', 'sigma') + + def __init__(self, mu, sigma=0.0): + 'NormalDist where mu is the mean and sigma is the standard deviation' + if sigma < 0.0: + raise StatisticsError('sigma must be non-negative') + self.mu = mu + self.sigma = sigma + + @classmethod + def from_samples(cls, data): + 'Make a normal distribution instance from sample data' + if not isinstance(data, (list, tuple)): + data = list(data) + xbar = fmean(data) + return cls(xbar, stdev(data, xbar)) + + def samples(self, n, seed=None): + 'Generate *n* samples for a given 
mean and standard deviation' + gauss = random.gauss if seed is None else random.Random(seed).gauss + mu, sigma = self.mu, self.sigma + return [gauss(mu, sigma) for i in range(n)] + + def pdf(self, x): + 'Probability density function: P(x <= X < x+dx) / dx' + variance = self.sigma ** 2.0 + if not variance: + raise StatisticsError('pdf() not defined when sigma is zero') + return exp((x - self.mu)**2.0 / (-2.0*variance)) / sqrt(tau * variance) + + def cdf(self, x): + 'Cumulative distribution function: P(X <= x)' + if not self.sigma: + raise StatisticsError('cdf() not defined when sigma is zero') + return 0.5 * (1.0 + erf((x - self.mu) / (self.sigma * sqrt(2.0)))) + + @property + def variance(self): + 'Square of the standard deviation' + return self.sigma ** 2.0 + + def __repr__(self): + return f'{type(self).__name__}(mu={self.mu!r}, sigma={self.sigma!r})' + + def __add__(x1, x2): + if isinstance(x2, NormalDist): + return NormalDist(x1.mu + x2.mu, hypot(x1.sigma, x2.sigma)) + return NormalDist(x1.mu + x2, x1.sigma) + + def __sub__(x1, x2): + if isinstance(x2, NormalDist): + return NormalDist(x1.mu - x2.mu, hypot(x1.sigma, x2.sigma)) + return NormalDist(x1.mu - x2, x1.sigma) + + def __mul__(x1, x2): + return NormalDist(x1.mu * x2, x1.sigma * fabs(x2)) + + def __truediv__(x1, x2): + return NormalDist(x1.mu / x2, x1.sigma / fabs(x2)) + + def __pos__(x1): + return x1 + + def __neg__(x1): + return NormalDist(-x1.mu, x1.sigma) + + __radd__ = __add__ + + def __rsub__(x1, x2): + return -(x1 - x2) + + __rmul__ = __mul__ + + +if __name__ == '__main__': + + from math import isclose + from operator import add, sub, mul, truediv + from itertools import repeat + + g1 = NormalDist(10, 20) + g2 = NormalDist(-5, 25) + + # Test scaling by a constant + assert (g1 * 5 / 5).mu == g1.mu + assert (g1 * 5 / 5).sigma == g1.sigma + + n = 100_000 + G1 = g1.samples(n) + G2 = g2.samples(n) + + for func in (add, sub): + print(f'\nTest {func.__name__} with another NormalDist:') + print(func(g1, g2)) 
+ print(NormalDist.from_samples(map(func, G1, G2))) + + const = 11 + for func in (add, sub, mul, truediv): + print(f'\nTest {func.__name__} with a constant:') + print(func(g1, const)) + print(NormalDist.from_samples(map(func, G1, repeat(const)))) + + const = 19 + for func in (add, sub, mul): + print(f'\nTest constant with {func.__name__}:') + print(func(const, g1)) + print(NormalDist.from_samples(map(func, repeat(const), G1))) + + def assert_close(G1, G2): + assert isclose(G1.mu, G2.mu, rel_tol=0.01), (G1, G2) + assert isclose(G1.sigma, G2.sigma, rel_tol=0.01), (G1, G2) + + X = NormalDist(-105, 73) + Y = NormalDist(31, 47) + s = 32.75 + n = 100_000 + + S = NormalDist.from_samples([x + s for x in X.samples(n)]) + assert_close(X + s, S) + + S = NormalDist.from_samples([x - s for x in X.samples(n)]) + assert_close(X - s, S) + + S = NormalDist.from_samples([x * s for x in X.samples(n)]) + assert_close(X * s, S) + + S = NormalDist.from_samples([x / s for x in X.samples(n)]) + assert_close(X / s, S) + + S = NormalDist.from_samples([x + y for x, y in zip(X.samples(n), + Y.samples(n))]) + assert_close(X + Y, S) + + S = NormalDist.from_samples([x - y for x, y in zip(X.samples(n), + Y.samples(n))]) + assert_close(X - Y, S) + + + ######### Examples ##################################################### + + # Simple scaling and translation + temperature_february = NormalDist(5, 2.5) # Celsius + print(temperature_february * (9/5) + 32) # Fahrenheit + + + # Classic probability problems + # https://blog.prepscholar.com/sat-standard-deviation + # The mean score on a SAT exam is 1060 with a standard deviation of 195 + # What percentage of students score between 1100 and 1200? 
+ sat = NormalDist(1060, 195) + fraction = sat.cdf(1200) - sat.cdf(1100) + print(f'{fraction * 100 :.1f}% score between 1100 and 1200') + + + # Combination of normal distributions by summing variances + birth_weights = NormalDist.from_samples([2.5, 3.1, 2.1, 2.4, 2.7, 3.5]) + drug_effects = NormalDist(0.4, 0.15) + print(birth_weights + drug_effects) + + + # Statistical calculation estimates using simulations + # Estimate the distribution of X * Y / Z + n = 100_000 + X = NormalDist(350, 15).samples(n) + Y = NormalDist(47, 17).samples(n) + Z = NormalDist(62, 6).samples(n) + print(NormalDist.from_samples(x * y / z for x, y, z in zip(X, Y, Z))) + + + # Naive Bayesian Classifier + # https://en.wikipedia.org/wiki/Naive\_Bayes\_classifier#Sex\_classification + + height_male = NormalDist.from_samples([6, 5.92, 5.58, 5.92]) + height_female = NormalDist.from_samples([5, 5.5, 5.42, 5.75]) + weight_male = NormalDist.from_samples([180, 190, 170, 165]) + weight_female = NormalDist.from_samples([100, 150, 130, 150]) + foot_size_male = NormalDist.from_samples([12, 11, 12, 10]) + foot_size_female = NormalDist.from_samples([6, 8, 7, 9]) + + prior_male = 0.5 + prior_female = 0.5 + posterior_male = prior_male * height_male.pdf(6) * weight_male.pdf(130) * foot_size_male.pdf(8) + posterior_female = prior_female * height_female.pdf(6) * weight_female.pdf(130) * foot_size_female.pdf(8) + print('Predict', 'male' if posterior_male > posterior_female else 'female') From 297e90bbf9697659f8312c91684d8528b57a6233 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger raymond.hettinger@gmail.com Date: Thu, 21 Feb 2019 13:06:55 -0800 Subject: [PATCH 02/16] Make immutable, comparable, and hashable --- Lib/statistics.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 8acd06e42b9d41..0b6d95119d7ba3 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -747,9 +747,6 @@ def variance(self): 'Square of the standard deviation' 
return self.sigma ** 2.0 - def __repr__(self): - return f'{type(self).__name__}(mu={self.mu!r}, sigma={self.sigma!r})' - def __add__(x1, x2): if isinstance(x2, NormalDist): return NormalDist(x1.mu + x2.mu, hypot(x1.sigma, x2.sigma)) @@ -779,6 +776,20 @@ def __rsub__(x1, x2): __rmul__ = __mul__ + def __setattr__(self, attr, value): + if attr not in ('mu', 'sigma'): + raise AttributeError(f"can't set attribute {attr}") + super().__setattr__(attr, value) + + def __eq__(x1, x2): + return (x1.mu, x2.sigma) == (x2.mu, x2.sigma) + + def __hash__(x1): + return hash((x1.mu, x2.sigma)) + + def __repr__(self): + return f'{type(self).__name__}(mu={self.mu!r}, sigma={self.sigma!r})' + if __name__ == '__main__': From 8ae88c8f42534a339341a8a4f060191d94513e64 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger raymond.hettinger@gmail.com Date: Thu, 21 Feb 2019 15:34:37 -0800 Subject: [PATCH 03/16] Fix-up the immutability/hashability/equality logic to match what is done in dataclasses --- Lib/statistics.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 0b6d95119d7ba3..2e3d8ac336f253 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -699,8 +699,6 @@ def pstdev(data, mu=None): ## Normal Distribution ##################################################### -fmean = lambda s: mean(map(float, s)) # XXX take this out when fmean() lands - class NormalDist: 'Normal distribution of a random variable' # https://en.wikipedia.org/wiki/Normal\_distribution @@ -712,8 +710,8 @@ def __init__(self, mu, sigma=0.0): 'NormalDist where mu is the mean and sigma is the standard deviation' if sigma < 0.0: raise StatisticsError('sigma must be non-negative') - self.mu = mu - self.sigma = sigma + object.__setattr__(self, 'mu', mu) + object.__setattr__(self, 'sigma', sigma) @classmethod def from_samples(cls, data): @@ -776,16 +774,18 @@ def __rsub__(x1, x2): __rmul__ = __mul__ - def __setattr__(self, attr, value): - if 
attr not in ('mu', 'sigma'): - raise AttributeError(f"can't set attribute {attr}") - super().__setattr__(attr, value) - def __eq__(x1, x2): - return (x1.mu, x2.sigma) == (x2.mu, x2.sigma) + if x1.__class__ is x2.__class__: + return (x1.mu, x1.sigma) == (x2.mu, x2.sigma) + return NotImplemented def __hash__(x1): - return hash((x1.mu, x2.sigma)) + return hash((x1.mu, x1.sigma)) + + def __setattr__(self, attr, value): + if type(self) is NormalDist or attr in ('mu', 'sigma'): + raise AttributeError(f"cannot set attribute {attr!r}") + super().__setattr__(attr, value) def __repr__(self): return f'{type(self).__name__}(mu={self.mu!r}, sigma={self.sigma!r})' From 69715cdd2b4237e14462cc0d682eae4050707efe Mon Sep 17 00:00:00 2001 From: Raymond Hettinger raymond.hettinger@gmail.com Date: Thu, 21 Feb 2019 15:47:10 -0800 Subject: [PATCH 04/16] Add NEWS entry --- .../next/Library/2019-02-21-15-47-00.bpo-36018.qt7QUe.rst | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2019-02-21-15-47-00.bpo-36018.qt7QUe.rst diff --git a/Misc/NEWS.d/next/Library/2019-02-21-15-47-00.bpo-36018.qt7QUe.rst b/Misc/NEWS.d/next/Library/2019-02-21-15-47-00.bpo-36018.qt7QUe.rst new file mode 100644 index 00000000000000..bba47f4ea91e30 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2019-02-21-15-47-00.bpo-36018.qt7QUe.rst @@ -0,0 +1,3 @@ +Add statistics.NormalDist, a tool for creating and manipulating normal +distributions of a random variable. Features a composite class that treats +the mean and standard deviation of measurement data as a single entity. 
From f444855bd23b475963856f924966b00421b12eb7 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger raymond.hettinger@gmail.com Date: Thu, 21 Feb 2019 18:22:54 -0800 Subject: [PATCH 05/16] First pass at documentation --- Doc/library/statistics.rst | 103 +++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 20a2c1cb13e115..afe1afa62a7290 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -467,6 +467,109 @@ A single exception is defined: Subclass of :exc:`ValueError` for statistics-related exceptions. + +:class:`NormalDist` objects +=========================== + +A :class:`NormalDist` is a a composite class that treats the mean and standard +deviation of data measurements as single entity. It is tool for creating and +manipulating normal distributions of a random variable. + +Normal distributions arise from the `Central Limit Theorem +`_ and have a wide range +of applications in statistics, including hypothesis testing. + +.. class:: NormalDist(mu, sigma=0.0) + + Returns a new *NormalDist* object where *mu* represents the `arithmetic + mean `_ of data and *sigma* + represented the `standard deviation + `_ of the data. + + If *sigma* is negative, raises :exc:`StatisticsError`. + + .. attribute:: mu + + A read-only attribute representing the mean of the normal distribution. + + .. attribute:: sigma + + A read-only attribute representing the standard deviation of the + normal distribution. + + .. attribute:: variance + + A read-only property representing the `variance + `_ of the normal + distribution. Equal to the square of the standard deviation. + + .. classmethod:: NormalDist.from_samples(data) + + Class method that makes an normal distribution instance + from sample data. The *data* can be any :term:`iterable` + and should consist of values that can be converted to type + :class:`float` values. 
+ + If *data* does not contain at least two elements, raises + :exc:`StatisticsError` because it takes at least one point to estimate + a central value and at least two points to estimate dispersion. + + .. method:: NormalDist.samples(n, seed=None) + + Generates *n* random samples for a given mean and standard deviation. + Returns a :class:`list` of :class:`float`. + + If *seed* is given, creates a new instance of the underlying random + number generator. This is useful for creating reproducible results, + even in a multi-threading context. + + .. method:: NormalDist.pdf(x) + + Using a `probability density function (pdf) + `_, + compute the relative likelihood that a random sample *X* will be near + the given value *x*. Mathematically, it is the ratio ``P(x <= X < + x+dx) / dx``. + + Note, the relative likelihood of *x* can be greater than `1.0`. The + probability of a specific point on a continuous distribution is `0.0`, + so the :func:`pdf` is used instead. It gives the probability of a + sample in a narrow range around *x* and then dividing that probability + by the width of the range (hence the word "density"). + + .. method:: NormalDist.cdf(x) + + Using a `cumulative distribution function (cdf) + `_, + compute probability that a random sample *X* will be less than or equal + to *x*. Mathematically, it is written ``P(X <= x)``. + + Instances of :class:`NormalDist` support addition, subtraction, + multiplication and division by a constant. These operations + are used for translation and scaling. For example:: + + >>> temperature_february = NormalDist(5, 2.5) # Celsius + >>> temperature_february * (9/5) + 32 # Fahrenheit + NormalDist(mu=41.0, sigma=4.5) + + Dividing a constant by an instance of :class:`NormalDist` is not supported. + + Since normal distributions arise from additive effects of independent + variables, it is possible to `add and subtract two normally distributed + random variables + `_ + represented as instances of :class:`NormalDist`. 
For example:: + + >>> birth_weights = NormalDist.from_samples([2.5, 3.1, 2.1, 2.4, 2.7, 3.5]) + >>> drug_effects = NormalDist(0.4, 0.15) + >>> combined = birth_weights + drug_effects + >>> f'mu={combined.mu :.1f} sigma={combined.sigma :.1f}' + 'mu=3.1 sigma=0.5' + + Instances of :class:`NormalDist` are :term:`immutable` and + :term:`hashable`. + + .. # This modelines must appear within the last ten lines of the file. kate: indent-width 3; remove-trailing-space on; replace-tabs on; encoding utf-8; From 93b2607d7961bed54b68bdc8d3c0f5bd31d3a434 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger raymond.hettinger@gmail.com Date: Thu, 21 Feb 2019 20:06:25 -0800 Subject: [PATCH 06/16] Move examples into the main docs --- Doc/library/statistics.rst | 74 ++++++++++++++++++++++++++++++++++++++ Lib/statistics.py | 51 ++------------------------ 2 files changed, 77 insertions(+), 48 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index afe1afa62a7290..728967b234a00f 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -569,6 +569,80 @@ of applications in statistics, including hypothesis testing. Instances of :class:`NormalDist` are :term:`immutable` and :term:`hashable`. +:class:`NormalDist` Examples and Recipes +---------------------------------------- + +A :class:`NormalDist` readily solves classic probability problems. 
+ +For example, given `historical data for SAT exams +`_ showing that scores +are normally distributed with a mean of 1060 and standard deviation of 195, +determine the percentage of students with scores between 1100 and 1200:: + + >>> sat = NormalDist(1060, 195) + >>> fraction = sat.cdf(1200) - sat.cdf(1100) + >>> f'{fraction * 100 :.1f}% score between 1100 and 1200' + '18.2% score between 1100 and 1200' + +To estimate the distribution for a model than isn't easy to solve +analytically, :class:`NormalDist` can generate input samples for a `Monte +Carlo simulation `_ of a +complex model:: + + >>> n = 100_000 + >>> X = NormalDist(350, 15).samples(n) + >>> Y = NormalDist(47, 17).samples(n) + >>> Z = NormalDist(62, 6).samples(n) + >>> model_simulation = [x * y / z for x, y, z in zip(X, Y, Z)] + >>> NormalDist.from_samples(model_simulation) + NormalDist(mu=267.6516398754636, sigma=101.357284306067) + +Normal distributions commonly arise in machine learning problems. + +Wikipedia has a `nice example with Naive Bayesian Classifier +`_. The challenge +is guess a person's gender from measurements of normally distributed +features including height, weight, and foot size. + +The `prior probability `_ of +male or female is 50%:: + + >>> prior_male = 0.5 + >>> prior_female = 0.5 + +We also have a training dataset with measurements for eight people. 
These +measurements are assumed to be normally distributed, so we summarize the data +with :class:`NormalDist`:: + + >>> height_male = NormalDist.from_samples([6, 5.92, 5.58, 5.92]) + >>> height_female = NormalDist.from_samples([5, 5.5, 5.42, 5.75]) + >>> weight_male = NormalDist.from_samples([180, 190, 170, 165]) + >>> weight_female = NormalDist.from_samples([100, 150, 130, 150]) + >>> foot_size_male = NormalDist.from_samples([12, 11, 12, 10]) + >>> foot_size_female = NormalDist.from_samples([6, 8, 7, 9]) + +We observe a new person whose feature measurements are known but whose gender +is unknown:: + + >>> h = 6.0 + >>> w = 130 + >>> f = 8 + +The posterior is the product of the prior times each the likelihoods of the +gender given a feature measurement:: + + >>> posterior_male = (prior_male * height_male.pdf(h) * + weight_male.pdf(w) * foot_size_male.pdf(f)) + >>> posterior_female = (prior_female * height_female.pdf(6) * + weight_female.pdf(130) * foot_size_female.pdf(8)) + +The final prediction is given to the largest posterior (this is known as the +`maximum a posterior +`_ (MAP):: + + >>> 'male' if posterior_male > posterior_female else 'female' + 'female' + .. # This modelines must appear within the last ten lines of the file. 
diff --git a/Lib/statistics.py b/Lib/statistics.py index 2e3d8ac336f253..e1039115a22751 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -793,6 +793,9 @@ def __repr__(self): if __name__ == '__main__': + # Show math operations computed analytically in comparison + # to a Monte Carlo simulation of the same operations + from math import isclose from operator import add, sub, mul, truediv from itertools import repeat @@ -853,51 +856,3 @@ def assert_close(G1, G2): S = NormalDist.from_samples([x - y for x, y in zip(X.samples(n), Y.samples(n))]) assert_close(X - Y, S) - - - ######### Examples ##################################################### - - # Simple scaling and translation - temperature_february = NormalDist(5, 2.5) # Celsius - print(temperature_february * (9/5) + 32) # Fahrenheit - - - # Classic probability problems - # https://blog.prepscholar.com/sat-standard-deviation - # The mean score on a SAT exam is 1060 with a standard deviation of 195 - # What percentage of students score between 1100 and 1200? 
- sat = NormalDist(1060, 195) - fraction = sat.cdf(1200) - sat.cdf(1100) - print(f'{fraction * 100 :.1f}% score between 1100 and 1200') - - - # Combination of normal distributions by summing variances - birth_weights = NormalDist.from_samples([2.5, 3.1, 2.1, 2.4, 2.7, 3.5]) - drug_effects = NormalDist(0.4, 0.15) - print(birth_weights + drug_effects) - - - # Statistical calculation estimates using simulations - # Estimate the distribution of X * Y / Z - n = 100_000 - X = NormalDist(350, 15).samples(n) - Y = NormalDist(47, 17).samples(n) - Z = NormalDist(62, 6).samples(n) - print(NormalDist.from_samples(x * y / z for x, y, z in zip(X, Y, Z))) - - - # Naive Bayesian Classifier - # https://en.wikipedia.org/wiki/Naive\_Bayes\_classifier#Sex\_classification - - height_male = NormalDist.from_samples([6, 5.92, 5.58, 5.92]) - height_female = NormalDist.from_samples([5, 5.5, 5.42, 5.75]) - weight_male = NormalDist.from_samples([180, 190, 170, 165]) - weight_female = NormalDist.from_samples([100, 150, 130, 150]) - foot_size_male = NormalDist.from_samples([12, 11, 12, 10]) - foot_size_female = NormalDist.from_samples([6, 8, 7, 9]) - - prior_male = 0.5 - prior_female = 0.5 - posterior_male = prior_male * height_male.pdf(6) * weight_male.pdf(130) * foot_size_male.pdf(8) - posterior_female = prior_female * height_female.pdf(6) * weight_female.pdf(130) * foot_size_female.pdf(8) - print('Predict', 'male' if posterior_male > posterior_female else 'female') From 5fa0d27d965f3fa1904dd3bc4aa739300c98288f Mon Sep 17 00:00:00 2001 From: Raymond Hettinger raymond.hettinger@gmail.com Date: Thu, 21 Feb 2019 20:37:55 -0800 Subject: [PATCH 07/16] More proofreading --- Doc/library/statistics.rst | 44 +++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 728967b234a00f..11177772f98a15 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -472,40 +472,40 @@ A single 
exception is defined: =========================== A :class:`NormalDist` is a a composite class that treats the mean and standard -deviation of data measurements as single entity. It is tool for creating and -manipulating normal distributions of a random variable. +deviation of data measurements as a single entity. It is a tool for creating +and manipulating normal distributions of a random variable. Normal distributions arise from the `Central Limit Theorem`_ and have a wide range -of applications in statistics, including hypothesis testing. +of applications in statistics, including simulations and hypothesis testing. .. class:: NormalDist(mu, sigma=0.0) Returns a new *NormalDist* object where *mu* represents the `arithmetic mean `_ of data and *sigma* - represented the `standard deviation + represents the `standard deviation`_ of the data. If *sigma* is negative, raises :exc:`StatisticsError`. .. attribute:: mu - A read-only attribute representing the mean of the normal distribution. + A read-only attribute for the mean of a normal distribution. .. attribute:: sigma - A read-only attribute representing the standard deviation of the - normal distribution. + A read-only attribute for the standard deviation of a normal + distribution. .. attribute:: variance A read-only property representing the `variance - `_ of the normal + `_ of a normal distribution. Equal to the square of the standard deviation. .. classmethod:: NormalDist.from_samples(data) - Class method that makes an normal distribution instance + Class method that makes a normal distribution instance from sample data. The *data* can be any :term:`iterable` and should consist of values that can be converted to type :class:`float` values. @@ -517,7 +517,7 @@ of applications in statistics, including hypothesis testing. .. method:: NormalDist.samples(n, seed=None) Generates *n* random samples for a given mean and standard deviation. - Returns a :class:`list` of :class:`float`. 
+ Returns a :class:`list` of :class:`float` values. If *seed* is given, creates a new instance of the underlying random number generator. This is useful for creating reproducible results, @@ -531,18 +531,18 @@ of applications in statistics, including hypothesis testing. the given value *x*. Mathematically, it is the ratio ``P(x <= X < x+dx) / dx``. - Note, the relative likelihood of *x* can be greater than `1.0`. The - probability of a specific point on a continuous distribution is `0.0`, + Note the relative likelihood of *x* can be greater than `1.0`. The + probability for a specific point on a continuous distribution is `0.0`, so the :func:`pdf` is used instead. It gives the probability of a - sample in a narrow range around *x* and then dividing that probability - by the width of the range (hence the word "density"). + sample occurring in a narrow range around *x* and then dividing that + probability by the width of the range (hence the word "density"). .. method:: NormalDist.cdf(x) Using a `cumulative distribution function (cdf)`_, - compute probability that a random sample *X* will be less than or equal - to *x*. Mathematically, it is written ``P(X <= x)``. + compute the probability that a random sample *X* will be less than or + equal to *x*. Mathematically, it is written ``P(X <= x)``. Instances of :class:`NormalDist` support addition, subtraction, multiplication and division by a constant. These operations @@ -586,8 +586,8 @@ determine the percentage of students with scores between 1100 and 1200:: To estimate the distribution for a model than isn't easy to solve analytically, :class:`NormalDist` can generate input samples for a `Monte -Carlo simulation `_ of a -complex model:: +Carlo simulation `_ of the +model:: >>> n = 100_000 >>> X = NormalDist(350, 15).samples(n) @@ -599,9 +599,9 @@ complex model:: Normal distributions commonly arise in machine learning problems. 
-Wikipedia has a `nice example with Naive Bayesian Classifier +Wikipedia has a `nice example with a Naive Bayesian Classifier`_. The challenge -is guess a person's gender from measurements of normally distributed +is to guess a person's gender from measurements of normally distributed features including height, weight, and foot size. The `prior probability `_ of @@ -636,9 +636,9 @@ gender given a feature measurement:: >>> posterior_female = (prior_female * height_female.pdf(6) * weight_female.pdf(130) * foot_size_female.pdf(8)) -The final prediction is given to the largest posterior (this is known as the +The final prediction is given to the largest posterior -- this is known as the `maximum a posterior -`_ (MAP):: +`_ or MAP:: >>> 'male' if posterior_male > posterior_female else 'female' 'female' From b1d1d9d64b3ba851b1cc1b7f10226ecf06d7ba02 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger raymond.hettinger@gmail.com Date: Thu, 21 Feb 2019 23:28:19 -0800 Subject: [PATCH 08/16] Add tests --- Lib/test/test_statistics.py | 166 ++++++++++++++++++++++++++++++++++++ 1 file changed, 166 insertions(+) diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index e35144677ad53e..4f6650ec899a90 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -2025,6 +2025,172 @@ def test_compare_to_variance(self): expected = math.sqrt(statistics.variance(data)) self.assertEqual(self.func(data), expected) +class TestNormalDist(unittest.TestCase): + + def test_slots(self): + nd = statistics.NormalDist(300, 23) + with self.assertRaises(TypeError): + vars(nd) + self.assertEqual(nd.__slots__, ('mu', 'sigma')) + + def test_instantiation_and_attributes(self): + nd = statistics.NormalDist(500, 17) + self.assertEqual(nd.mu, 500) + self.assertEqual(nd.sigma, 17) + self.assertEqual(nd.variance, 17**2) + + # default argument + nd = statistics.NormalDist(400) + self.assertEqual(nd.mu, 400) + self.assertEqual(nd.sigma, 0) + self.assertEqual(nd.variance, 0**2) 
+ + # error case: negative sigma + with self.assertRaises(statistics.StatisticsError): + statistics.NormalDist(500, -10) + + def test_alternative_constructor(self): + NormalDist = statistics.NormalDist + data = [96, 107, 90, 92, 110] + # list input + self.assertEqual(NormalDist.from_samples(data), NormalDist(99, 9)) + # tuple input + self.assertEqual(NormalDist.from_samples(tuple(data)), NormalDist(99, 9)) + # iterator input + self.assertEqual(NormalDist.from_samples(iter(data)), NormalDist(99, 9)) + # error cases + with self.assertRaises(statistics.StatisticsError): + NormalDist.from_samples([]) # empty input + with self.assertRaises(statistics.StatisticsError): + NormalDist.from_samples([10]) # only one input + + def test_sample_generation(self): + NormalDist = statistics.NormalDist + mu, sigma = 10_000, 3.0 + X = NormalDist(mu, sigma) + n = 1_000 + data = X.samples(n) + self.assertEqual(len(data), n) + self.assertEqual(set(map(type, data)), {float}) + # mean(data) expected to fall within 8 standard deviations + xbar = statistics.mean(data) + self.assertTrue(mu - sigma*8 <= xbar <= mu + sigma*8) + + # verify that seeding makes reproducible sequences + n = 100 + data1 = X.samples(n, seed='happiness and joy') + data2 = X.samples(n, seed='trouble and despair') + data3 = X.samples(n, seed='happiness and joy') + data4 = X.samples(n, seed='trouble and despair') + self.assertEqual(data1, data3) + self.assertEqual(data2, data4) + self.assertNotEqual(data1, data2) + + # verify that subclass type is honored + class NewNormalDist(NormalDist): + pass + nnd = NewNormalDist(200, 5) + self.assertEqual(type(nnd), NewNormalDist) + + def test_pdf(self): + NormalDist = statistics.NormalDist + X = NormalDist(100, 15) + # Verify peak around center + self.assertLess(X.pdf(99), X.pdf(100)) + self.assertLess(X.pdf(101), X.pdf(100)) + # Test symmetry + self.assertAlmostEqual(X.pdf(99), X.pdf(101)) + self.assertAlmostEqual(X.pdf(98), X.pdf(102)) + self.assertAlmostEqual(X.pdf(97), 
X.pdf(103)) + # Test vs CDF + dx = 2.0 ** -10 + for x in range(90, 111): + est_pdf = (X.cdf(x + dx) - X.cdf(x)) / dx + self.assertAlmostEqual(X.pdf(x), est_pdf, places=4) + # Error case: variance is zero + Y = NormalDist(100, 0) + with self.assertRaises(statistics.StatisticsError): + Y.pdf(90) + + def test_cdf(self): + NormalDist = statistics.NormalDist + X = NormalDist(100, 15) + cdfs = [X.cdf(x) for x in range(1, 200)] + self.assertEqual(set(map(type, cdfs)), {float}) + # Verify montonic + self.assertEqual(cdfs, sorted(cdfs)) + # Verify center + self.assertAlmostEqual(X.cdf(100), 0.50) + # Error case: variance is zero + Y = NormalDist(100, 0) + with self.assertRaises(statistics.StatisticsError): + Y.cdf(90) + + def test_same_type_addition_and_subtraction(self): + NormalDist = statistics.NormalDist + X = NormalDist(100, 12) + Y = NormalDist(40, 5) + self.assertEqual(X + Y, NormalDist(140, 13)) # __add__ + self.assertEqual(X - Y, NormalDist(60, 13)) # __sub__ + + def test_translation_and_scaling(self): + NormalDist = statistics.NormalDist + X = NormalDist(100, 15) + y = 10 + self.assertEqual(+X, NormalDist(100, 15)) # __pos__ + self.assertEqual(-X, NormalDist(-100, 15)) # __neg__ + self.assertEqual(X + y, NormalDist(110, 15)) # __add__ + self.assertEqual(y + X, NormalDist(110, 15)) # __radd__ + self.assertEqual(X - y, NormalDist(90, 15)) # __sub__ + self.assertEqual(y - X, NormalDist(-90, 15)) # __rsub__ + self.assertEqual(X * y, NormalDist(1000, 150)) # __mul__ + self.assertEqual(y * X, NormalDist(1000, 150)) # __rmul__ + self.assertEqual(X / y, NormalDist(10, 1.5)) # __truediv__ + with self.assertRaises(TypeError): + y / X + + def test_immutability_hashability(self): + # Attributes and property are not writeable + nd = statistics.NormalDist(500, 17) + with self.assertRaises(AttributeError): + nd.mu = 600 + with self.assertRaises(AttributeError): + nd.sigma = 34 + with self.assertRaises(AttributeError): + nd.variance = 40 + + # Subclasses can write additional 
attributes + # but cannot write to the parent attributes + class SD(statistics.NormalDist): + def __init__(self, mu, sigma, n): + super().__init__(mu, sigma) + self.n = n + sd = SD(700, 25, 85) + with self.assertRaises(AttributeError): + sd.mu = 600 + with self.assertRaises(AttributeError): + sd.sigma = 34 + with self.assertRaises(AttributeError): + sd.variance = 40 + self.assertEqual(sd.n, 85) + sd.n = 95 + self.assertEqual(sd.n, 95) + + # Distinct types compare as distinct + self.assertEqual(len({nd, sd}), 2) + sd2 = SD(nd.mu, nd.sigma, 105) + self.assertEqual(len({nd, sd2}), 2) + + # Within a type, both components must agree to be considered the same + nd2 = statistics.NormalDist(nd.mu - 1, nd.sigma) + nd3 = statistics.NormalDist(nd.mu, nd.sigma + 1) + nd4 = statistics.NormalDist(nd.mu, nd.sigma) + self.assertEqual(len({nd, nd2, nd3, nd4}), 3) + + def test_repr(self): + nd = statistics.NormalDist(37.5, 5.625) + self.assertEqual(repr(nd), 'NormalDist(mu=37.5, sigma=5.625)') + # === Run tests === From 780574f585255994486eca364767c66e61699155 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger raymond.hettinger@gmail.com Date: Thu, 21 Feb 2019 23:36:39 -0800 Subject: [PATCH 09/16] Add whatsnew entry and a versionadded directive --- Doc/library/statistics.rst | 3 +++ Doc/whatsnew/3.8.rst | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 11177772f98a15..cf07f540608263 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -569,6 +569,9 @@ of applications in statistics, including simulations and hypothesis testing. Instances of :class:`NormalDist` are :term:`immutable` and :term:`hashable`. + .. 
versionadded:: 3.8 + + :class:`NormalDist` Examples and Recipes ---------------------------------------- diff --git a/Doc/whatsnew/3.8.rst b/Doc/whatsnew/3.8.rst index bf7300db094589..2c69f31c636116 100644 --- a/Doc/whatsnew/3.8.rst +++ b/Doc/whatsnew/3.8.rst @@ -262,6 +262,10 @@ Added :func:`statistics.fmean` as a faster, floating point variant of :func:`statistics.mean()`. (Contributed by Raymond Hettinger and Steven D'Aprano in :issue:`35904`.) +Added :class:`statistics.NormalDist`, a tool for creating +and manipulating normal distributions of a random variable. +(Contributed by Raymond Hettinger in :issue:`36018`.) + tokenize -------- From b9008c34f16ff0ab68fb18d35f9c19cdddb067ea Mon Sep 17 00:00:00 2001 From: Raymond Hettinger raymond.hettinger@gmail.com Date: Fri, 22 Feb 2019 00:13:19 -0800 Subject: [PATCH 10/16] Add an example to whatsnew --- Doc/whatsnew/3.8.rst | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/Doc/whatsnew/3.8.rst b/Doc/whatsnew/3.8.rst index 2c69f31c636116..586027cc51feb9 100644 --- a/Doc/whatsnew/3.8.rst +++ b/Doc/whatsnew/3.8.rst @@ -266,6 +266,28 @@ Added :class:`statistics.NormalDist`, a tool for creating and manipulating normal distributions of a random variable. (Contributed by Raymond Hettinger in :issue:`36018`.) 
+:: + + >>> temperature_feb = NormalDist.from_samples([4, 12, -3, 2, 7, 14]) + >>> temperature_feb + NormalDist(mu=6.0, sigma=6.356099432828281) + + >>> temperature_feb.cdf(3) # Chance of being under 3 degrees + 0.3184678262814532 + >>> # Relative chance of being 7 degrees versus 10 degrees + >>> temperature_feb.pdf(7) / temperature_feb.pdf(10) + 1.2039930378537762 + + >>> el_nino = NormalDist(4, 2.5) + >>> temperature_feb += el_nino # Add in a climate effect + >>> temperature_feb + NormalDist(mu=10.0, sigma=6.830080526611674) + + >>> temperature_feb * (9/5) + 32 # Convert to Fahrenheit + NormalDist(mu=50.0, sigma=12.294144947901014) + >>> temperature_feb.samples(3) # Generate random samples + [7.672102882379219, 12.000027119750287, 4.647488369766392] + tokenize -------- From a50a7bef7c74b78677bcfd6198f5ec7d078d8bba Mon Sep 17 00:00:00 2001 From: Raymond Hettinger raymond.hettinger@gmail.com Date: Fri, 22 Feb 2019 06:47:10 -0800 Subject: [PATCH 11/16] Default to the standard normal distribution --- Doc/library/statistics.rst | 2 +- Lib/statistics.py | 2 +- Lib/test/test_statistics.py | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index cf07f540608263..8679099678aca4 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -479,7 +479,7 @@ Normal distributions arise from the `Central Limit Theorem`_ and have a wide range of applications in statistics, including simulations and hypothesis testing. -.. class:: NormalDist(mu, sigma=0.0) +.. 
class:: NormalDist(mu=0.0, sigma=1.0) Returns a new *NormalDist* object where *mu* represents the `arithmetic mean `_ of data and *sigma* diff --git a/Lib/statistics.py b/Lib/statistics.py index e1039115a22751..bfa26e4be9b56d 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -706,7 +706,7 @@ class NormalDist: __slots__ = ('mu', 'sigma') - def __init__(self, mu, sigma=0.0): + def __init__(self, mu=0.0, sigma=1.0): 'NormalDist where mu is the mean and sigma is the standard deviation' if sigma < 0.0: raise StatisticsError('sigma must be non-negative') diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index 4f6650ec899a90..bdae66aa7a211b 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -2039,11 +2039,11 @@ def test_instantiation_and_attributes(self): self.assertEqual(nd.sigma, 17) self.assertEqual(nd.variance, 17**2) - # default argument - nd = statistics.NormalDist(400) - self.assertEqual(nd.mu, 400) - self.assertEqual(nd.sigma, 0) - self.assertEqual(nd.variance, 0**2) + # default arguments + nd = statistics.NormalDist() + self.assertEqual(nd.mu, 0) + self.assertEqual(nd.sigma, 1) + self.assertEqual(nd.variance, 1**2) # error case: negative sigma with self.assertRaises(statistics.StatisticsError): From 536f2863cf920e6a6ab2fff4f9ddab9f8164c95c Mon Sep 17 00:00:00 2001 From: Raymond Hettinger raymond.hettinger@gmail.com Date: Fri, 22 Feb 2019 07:51:27 -0800 Subject: [PATCH 12/16] More proofreading --- Doc/library/statistics.rst | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 8679099678aca4..a85fab1c38f557 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -508,7 +508,7 @@ of applications in statistics, including simulations and hypothesis testing. Class method that makes a normal distribution instance from sample data. 
The *data* can be any :term:`iterable` and should consist of values that can be converted to type - :class:`float` values. + :class:`float`. If *data* does not contain at least two elements, raises :exc:`StatisticsError` because it takes at least one point to estimate @@ -608,7 +608,7 @@ is to guess a person's gender from measurements of normally distributed features including height, weight, and foot size. The `prior probability `_ of -male or female is 50%:: +being male or female is 50%:: >>> prior_male = 0.5 >>> prior_female = 0.5 @@ -627,20 +627,20 @@ with :class:`NormalDist`:: We observe a new person whose feature measurements are known but whose gender is unknown:: - >>> h = 6.0 - >>> w = 130 - >>> f = 8 + >>> ht = 6.0 # height + >>> wt = 130 # weight + >>> fs = 8 # foot size -The posterior is the product of the prior times each the likelihoods of the -gender given a feature measurement:: +The posterior is the product of the prior times each likelihood of a +feature measurement given the gender:: - >>> posterior_male = (prior_male * height_male.pdf(h) * - weight_male.pdf(w) * foot_size_male.pdf(f)) - >>> posterior_female = (prior_female * height_female.pdf(6) * - weight_female.pdf(130) * foot_size_female.pdf(8)) + >>> posterior_male = (prior_male * height_male.pdf(ht) * + weight_male.pdf(wt) * foot_size_male.pdf(fs)) + >>> posterior_female = (prior_female * height_female.pdf(ht) * + weight_female.pdf(wt) * foot_size_female.pdf(fs)) -The final prediction is given to the largest posterior -- this is known as the -`maximum a posterior +The final prediction is awarded to the largest posterior -- this is known as +the `maximum a posteriori`_ or MAP:: >>> 'male' if posterior_male > posterior_female else 'female' From e627983e20c00d98f28c8e243b4e12711467ec1a Mon Sep 17 00:00:00 2001 From: Raymond Hettinger raymond.hettinger@gmail.com Date: Fri, 22 Feb 2019 08:40:53 -0800 Subject: [PATCH 13/16] Enable doctests --- Doc/library/statistics.rst | 45 
+++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index a85fab1c38f557..32ac2810448844 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -546,7 +546,9 @@ of applications in statistics, including simulations and hypothesis testing. Instances of :class:`NormalDist` support addition, subtraction, multiplication and division by a constant. These operations - are used for translation and scaling. For example:: + are used for translation and scaling. For example: + + .. doctest:: >>> temperature_february = NormalDist(5, 2.5) # Celsius >>> temperature_february * (9/5) + 32 # Fahrenheit @@ -558,7 +560,9 @@ of applications in statistics, including simulations and hypothesis testing. variables, it is possible to `add and subtract two normally distributed random variables`_ - represented as instances of :class:`NormalDist`. For example:: + represented as instances of :class:`NormalDist`. For example: + + .. doctest:: >>> birth_weights = NormalDist.from_samples([2.5, 3.1, 2.1, 2.4, 2.7, 3.5]) >>> drug_effects = NormalDist(0.4, 0.15) @@ -580,24 +584,28 @@ A :class:`NormalDist` readily solves classic probability problems. For example, given `historical data for SAT exams`_ showing that scores are normally distributed with a mean of 1060 and standard deviation of 192, -determine the percentage of students with scores between 1100 and 1200:: +determine the percentage of students with scores between 1100 and 1200: + +.. doctest:: >>> sat = NormalDist(1060, 195) >>> fraction = sat.cdf(1200) - sat.cdf(1100) >>> f'{fraction * 100 :.1f}% score between 1100 and 1200' - '18.2% score between 1100 and 1200 + '18.2% score between 1100 and 1200' To estimate the distribution for a model than isn't easy to solve analytically, :class:`NormalDist` can generate input samples for a `Monte Carlo simulation `_ of the -model:: +model: + +.. 
doctest:: >>> n = 100_000 >>> X = NormalDist(350, 15).samples(n) >>> Y = NormalDist(47, 17).samples(n) >>> Z = NormalDist(62, 6).samples(n) >>> model_simulation = [x * y / z for x, y, z in zip(X, Y, Z)] - >>> NormalDist.from_samples(model_simulation) + >>> NormalDist.from_samples(model_simulation) # doctest: +SKIP NormalDist(mu=267.6516398754636, sigma=101.357284306067) Normal distributions commonly arise in machine learning problems. @@ -608,14 +616,18 @@ is to guess a person's gender from measurements of normally distributed features including height, weight, and foot size. The `prior probability `_ of -being male or female is 50%:: +being male or female is 50%: + +.. doctest:: >>> prior_male = 0.5 >>> prior_female = 0.5 We also have a training dataset with measurements for eight people. These measurements are assumed to be normally distributed, so we summarize the data -with :class:`NormalDist`:: +with :class:`NormalDist`: + +.. doctest:: >>> height_male = NormalDist.from_samples([6, 5.92, 5.58, 5.92]) >>> height_female = NormalDist.from_samples([5, 5.5, 5.42, 5.75]) @@ -625,23 +637,30 @@ with :class:`NormalDist`:: >>> foot_size_female = NormalDist.from_samples([6, 8, 7, 9]) We observe a new person whose feature measurements are known but whose gender -is unknown:: +is unknown: + +.. doctest:: >>> ht = 6.0 # height >>> wt = 130 # weight >>> fs = 8 # foot size The posterior is the product of the prior times each likelihood of a -feature measurement given the gender:: +feature measurement given the gender: + +.. doctest:: >>> posterior_male = (prior_male * height_male.pdf(ht) * - weight_male.pdf(wt) * foot_size_male.pdf(fs)) + ... weight_male.pdf(wt) * foot_size_male.pdf(fs)) + >>> posterior_female = (prior_female * height_female.pdf(ht) * - weight_female.pdf(wt) * foot_size_female.pdf(fs)) + ... 
weight_female.pdf(wt) * foot_size_female.pdf(fs)) The final prediction is awarded to the largest posterior -- this is known as the `maximum a posteriori -`_ or MAP:: +`_ or MAP: + +.. doctest:: >>> 'male' if posterior_male > posterior_female else 'female' 'female' From b22d9640b17e6333c454897690feddb1e9d14e65 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger raymond.hettinger@gmail.com Date: Fri, 22 Feb 2019 09:17:23 -0800 Subject: [PATCH 14/16] Added pickle/copy support --- Lib/statistics.py | 3 +++ Lib/test/test_statistics.py | 11 +++++++++++ 2 files changed, 14 insertions(+) diff --git a/Lib/statistics.py b/Lib/statistics.py index bfa26e4be9b56d..5fc70862ea79bd 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -787,6 +787,9 @@ def __setattr__(self, attr, value): raise AttributeError(f"cannot set attribute {attr!r}") super().__setattr__(attr, value) + def __reduce__(self): + return (type(self), (self.mu, self.sigma)) + def __repr__(self): return f'{type(self).__name__}(mu={self.mu!r}, sigma={self.sigma!r})' diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index bdae66aa7a211b..6eef0d10788624 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -5,9 +5,11 @@ import collections import collections.abc +import copy import decimal import doctest import math +import pickle import random import sys import unittest @@ -2191,6 +2193,15 @@ def test_repr(self): nd = statistics.NormalDist(37.5, 5.625) self.assertEqual(repr(nd), 'NormalDist(mu=37.5, sigma=5.625)') + def test_pickle_and_copy(self): + nd = statistics.NormalDist(37.5, 5.625) + nd1 = copy.copy(nd) + self.assertEqual(nd, nd1) + nd2 = copy.deepcopy(nd) + self.assertEqual(nd, nd2) + nd3 = pickle.loads(pickle.dumps(nd)) + self.assertEqual(nd, nd3) + # === Run tests === From b84a1f7477ec087cf6e6f469514f4fa16a3c7604 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger raymond.hettinger@gmail.com Date: Fri, 22 Feb 2019 15:02:20 -0800 Subject: [PATCH 15/16] Let subclasses 
compare as equal by default --- Lib/statistics.py | 6 ++--- Lib/test/test_statistics.py | 51 ++++++++++++++++++++++++++++++------- 2 files changed, 45 insertions(+), 12 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 5fc70862ea79bd..01a9850cff353a 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -775,9 +775,9 @@ def __rsub__(x1, x2): __rmul__ = __mul__ def __eq__(x1, x2): - if x1.__class__ is x2.__class__: - return (x1.mu, x2.sigma) == (x2.mu, x2.sigma) - return NotImplemented + if not isinstance(x2, NormalDist): + return NotImplemented + return (x1.mu, x2.sigma) == (x2.mu, x2.sigma) def __hash__(x1): return hash((x1.mu, x1.sigma)) diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index 6eef0d10788624..95e1a449c03c1c 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -2151,6 +2151,44 @@ def test_translation_and_scaling(self): with self.assertRaises(TypeError): y / X + def test_equality(self): + NormalDist = statistics.NormalDist + nd1 = NormalDist() + nd2 = NormalDist(2, 4) + nd3 = NormalDist() + self.assertNotEqual(nd1, nd2) + self.assertEqual(nd1, nd3) + + # Test NotImplemented when types are different + class A: + def __eq__(self, other): + return 10 + a = A() + self.assertEqual(nd1.__eq__(a), NotImplemented) + self.assertEqual(nd1 == a, 10) + self.assertEqual(a == nd1, 10) + + # All subclasses to compare equal giving the same behavior + # as list, tuple, int, float, complex, str, dict, set, etc. 
+ class SizedNormalDist(NormalDist): + def __init__(self, mu, sigma, n): + super().__init__(mu, sigma) + self.n = n + s = SizedNormalDist(100, 15, 57) + nd4 = NormalDist(100, 15) + self.assertEqual(s, nd4) + + # Don't allow duck type equality because we wouldn't + # want a lognormal distribution to compare equal + # to a normal distribution with the same parameters + class LognormalDist: + def __init__(self, mu, sigma): + self.mu = mu + self.sigma = sigma + lnd = LognormalDist(100, 15) + nd = NormalDist(100, 15) + self.assertNotEqual(nd, lnd) + def test_immutability_hashability(self): # Attributes and property are not writeable nd = statistics.NormalDist(500, 17) @@ -2178,21 +2216,12 @@ def __init__(self, mu, sigma, n): sd.n = 95 self.assertEqual(sd.n, 95) - # Distinct types compare as distinct - self.assertEqual(len({nd, sd}), 2) - sd2 = SD(nd.mu, nd.sigma, 105) - self.assertEqual(len({nd, sd2}), 2) - # Within a type, both components must agree to be considered the same nd2 = statistics.NormalDist(nd.mu - 1, nd.sigma) nd3 = statistics.NormalDist(nd.mu, nd.sigma + 1) nd4 = statistics.NormalDist(nd.mu, nd.sigma) self.assertEqual(len({nd, nd2, nd3, nd4}), 3) - def test_repr(self): - nd = statistics.NormalDist(37.5, 5.625) - self.assertEqual(repr(nd), 'NormalDist(mu=37.5, sigma=5.625)') - def test_pickle_and_copy(self): nd = statistics.NormalDist(37.5, 5.625) nd1 = copy.copy(nd) @@ -2202,6 +2231,10 @@ def test_pickle_and_copy(self): nd3 = pickle.loads(pickle.dumps(nd)) self.assertEqual(nd, nd3) + def test_repr(self): + nd = statistics.NormalDist(37.5, 5.625) + self.assertEqual(repr(nd), 'NormalDist(mu=37.5, sigma=5.625)') + # === Run tests === From ba752f3e52afc644333787d6e42ceb71583566a2 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger raymond.hettinger@gmail.com Date: Sat, 23 Feb 2019 00:19:17 -0800 Subject: [PATCH 16/16] Strip away the non-essential. Goodbye immutability and hashability. 
Simpler is better --- Doc/library/statistics.rst | 8 ++------ Lib/statistics.py | 15 ++------------- Lib/test/test_statistics.py | 33 --------------------------------- 3 files changed, 4 insertions(+), 52 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 32ac2810448844..c1be295cbf9502 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -490,12 +490,11 @@ of applications in statistics, including simulations and hypothesis testing. .. attribute:: mu - A read-only attribute for the mean of a normal distribution. + The mean of a normal distribution. .. attribute:: sigma - A read-only attribute for the standard deviation of a normal - distribution. + The standard deviation of a normal distribution. .. attribute:: variance @@ -570,9 +569,6 @@ of applications in statistics, including simulations and hypothesis testing. >>> f'mu={combined.mu :.1f} sigma={combined.sigma :.1f}' 'mu=3.1 sigma=0.5' - Instances of :class:`NormalDist` are :term:`immutable` and - :term:`hashable`. - .. 
versionadded:: 3.8 diff --git a/Lib/statistics.py b/Lib/statistics.py index 01a9850cff353a..a73001ac554c88 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -710,8 +710,8 @@ def __init__(self, mu=0.0, sigma=1.0): 'NormalDist where mu is the mean and sigma is the standard deviation' if sigma < 0.0: raise StatisticsError('sigma must be non-negative') - object.__setattr__(self, 'mu', mu) - object.__setattr__(self, 'sigma', sigma) + self.mu = mu + self.sigma = sigma @classmethod def from_samples(cls, data): @@ -779,17 +779,6 @@ def __eq__(x1, x2): return NotImplemented return (x1.mu, x2.sigma) == (x2.mu, x2.sigma) - def __hash__(x1): - return hash((x1.mu, x1.sigma)) - - def __setattr__(self, attr, value): - if type(self) is NormalDist or attr in ('mu', 'sigma'): - raise AttributeError(f"cannot set attribute {attr!r}") - super().__setattr__(attr, value) - - def __reduce__(self): - return (type(self), (self.mu, self.sigma)) - def __repr__(self): return f'{type(self).__name__}(mu={self.mu!r}, sigma={self.sigma!r})' diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index 95e1a449c03c1c..a65fbe8dd259f0 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -2189,39 +2189,6 @@ def __init__(self, mu, sigma): nd = NormalDist(100, 15) self.assertNotEqual(nd, lnd) - def test_immutability_hashability(self): - # Attributes and property are not writeable - nd = statistics.NormalDist(500, 17) - with self.assertRaises(AttributeError): - nd.mu = 600 - with self.assertRaises(AttributeError): - nd.sigma = 34 - with self.assertRaises(AttributeError): - nd.variance = 40 - - # Subclasses can write additional attributes - # but cannot write to the parent attributes - class SD(statistics.NormalDist): - def __init__(self, mu, sigma, n): - super().__init__(mu, sigma) - self.n = n - sd = SD(700, 25, 85) - with self.assertRaises(AttributeError): - sd.mu = 600 - with self.assertRaises(AttributeError): - sd.sigma = 34 - with 
self.assertRaises(AttributeError): - sd.variance = 40 - self.assertEqual(sd.n, 85) - sd.n = 95 - self.assertEqual(sd.n, 95) - - # Within a type, both components must agree to be considered the same - nd2 = statistics.NormalDist(nd.mu - 1, nd.sigma) - nd3 = statistics.NormalDist(nd.mu, nd.sigma + 1) - nd4 = statistics.NormalDist(nd.mu, nd.sigma) - self.assertEqual(len({nd, nd2, nd3, nd4}), 3) - def test_pickle_and_copy(self): nd = statistics.NormalDist(37.5, 5.625) nd1 = copy.copy(nd)