Note
Click here to download the full example code
Estimate a multivariate distribution¶
In this example we are going to estimate a joint distribution from a multivariate sample by fitting marginals and finding a set of copulas.
While the estimation of marginals is quite straightforward, the estimation of the dependency structure takes several steps:
find the dependent components
estimate a copula on each dependent bloc
assemble the estimated copulas
from __future__ import print_function
import openturns as ot
import math as m
ot.Log.Show(ot.Log.NONE)
generate some multivariate data to estimate, with correlation
cop1 = ot.AliMikhailHaqCopula(0.6)
cop2 = ot.ClaytonCopula(2.5)
copula = ot.ComposedCopula([cop1, cop2])
marginals = [ot.Uniform(5.0, 6.0), ot.Arcsine(), ot.Normal(-40.0, 3.0), ot.Triangular(100.0, 150.0, 300.0)]
distribution = ot.ComposedDistribution(marginals, copula)
sample = distribution.getSample(10000).getMarginal([0, 2, 3, 1])
estimate marginals
dimension = sample.getDimension()
marginalFactories = []
for factory in ot.DistributionFactory.GetContinuousUniVariateFactories():
if str(factory).startswith('Histogram'):
# ~ non-parametric
continue
marginalFactories.append(factory)
estimated_marginals = [ot.FittingTest.BestModelBIC(sample.getMarginal(i), marginalFactories)[0] for i in range(dimension)]
estimated_marginals
Out:
[class=Uniform name=Uniform dimension=1 a=4.9999 b=5.99997, class=Normal name=Normal dimension=1 mean=class=Point name=Unnamed dimension=1 values=[-40.0473] sigma=class=Point name=Unnamed dimension=1 values=[3.02509] correlationMatrix=class=CorrelationMatrix dimension=1 implementation=class=MatrixImplementation name=Unnamed rows=1 columns=1 values=[1], class=Triangular name=Triangular dimension=1 a=100.554 m=149.16 b=298.155, class=Beta name=Beta dimension=1 alpha=0.492615 beta=0.484034 a=-1.0002 b=1.0002]
Find connected components of a graph defined from its adjacency matrix
def find_neighbours(head, covariance, to_visit, visited):
N = covariance.getDimension()
visited[head] = 1
to_visit.remove(head)
current_component = [head]
for i in to_visit:
# If i is connected to head and has not yet been visited
if covariance[head, i] > 0:
# Add i to the current component
component = find_neighbours(i, covariance, to_visit, visited)
current_component += component
return current_component
def connected_components(covariance):
N = covariance.getDimension()
to_visit = list(range(N))
visited = [0] * N
all_components = []
for head in range(N):
if visited[head] == 0:
component = find_neighbours(head, covariance, to_visit, visited)
all_components.append(sorted(component))
return all_components
Estimate the copula¶
First find the dependent components : we compute the Spearman correlation
C = sample.computeSpearmanCorrelation()
print(C)
Out:
[[ 1 -0.00628907 -0.00150674 0.239662 ]
[ -0.00628907 1 0.746095 -0.0131841 ]
[ -0.00150674 0.746095 1 -0.00803575 ]
[ 0.239662 -0.0131841 -0.00803575 1 ]]
We filter and consider only significantly non-zero correlations.
epsilon = 1.0 / m.sqrt(sample.getSize())
for j in range(dimension):
for i in range(j):
C[i, j] = 1.0 if abs(C[i, j]) > epsilon else 0.0
print(C)
Out:
[[ 1 0 0 1 ]
[ 0 1 1 1 ]
[ 0 1 1 0 ]
[ 1 1 0 1 ]]
Note that we can apply the HypothesisTest.Spearman test. As the null hypothesis of the test is the independence, we must take the complementary of the binary measure as follow:
>>> M = ot.SymmetricMatrix(dimension)
>>> for i in range(dimension):
>>> M[i,i] = 1
>>> for j in range(i):
>>> M[i, j] = 1 - ot.HypothesisTest.Spearman(sample[:,i], sample[:,j]).getBinaryQualityMeasure()
Now we find the independent blocs:
blocs = connected_components(C)
blocs
Out:
[[0, 1, 2, 3]]
For each dependent block, we estimate the most accurate non parameteric copula.
To do this, we first need to transform the sample in such a way as to keep the copula intact but make all marginal samples follow the uniform distribution on [0,1].
copula_sample = ot.Sample(sample.getSize(), sample.getDimension())
copula_sample.setDescription(sample.getDescription())
for index in range(sample.getDimension()):
copula_sample[:, index] = estimated_marginals[index].computeCDF(sample[:, index])
copulaFactories = []
for factory in ot.DistributionFactory.GetContinuousMultiVariateFactories():
if not factory.build().isCopula():
continue
if factory.getImplementation().getClassName()=='BernsteinCopulaFactory':
continue
copulaFactories.append(factory)
estimated_copulas = [ot.FittingTest.BestModelBIC(copula_sample.getMarginal(bloc), copulaFactories)[0] for bloc in blocs]
estimated_copulas
Out:
[class=NormalCopula name=NormalCopula dimension=4 correlation=class=CorrelationMatrix dimension=4 implementation=class=MatrixImplementation name=Unnamed rows=4 columns=4 values=[1,-0.00655856,-0.00160652,0.249044,-0.00655856,1,0.769256,-0.0138322,-0.00160652,0.769256,1,-0.00830654,0.249044,-0.0138322,-0.00830654,1]]
Finally we assemble the copula
estimated_copula_perm = ot.ComposedCopula(estimated_copulas)
Take care of the order of each bloc vs the order of original components !
permutation = []
for bloc in blocs:
permutation.extend(bloc)
inverse_permutation = [-1] * dimension
for i in range(dimension):
inverse_permutation[permutation[i]] = i
estimated_copula = estimated_copula_perm.getMarginal(inverse_permutation)
estimated_copula
ComposedCopula(NormalCopula(R = [[ 1 -0.00655856 -0.00160652 0.249044 ]
[ -0.00655856 1 0.769256 -0.0138322 ]
[ -0.00160652 0.769256 1 -0.00830654 ]
[ 0.249044 -0.0138322 -0.00830654 1 ]]))
We build joint distribution from marginal distributions and dependency structure:
estimated_distribution = ot.ComposedDistribution(estimated_marginals, estimated_copula)
estimated_distribution
ComposedDistribution(Uniform(a = 4.9999, b = 5.99997), Normal(mu = -40.0473, sigma = 3.02509), Triangular(a = 100.554, m = 149.16, b = 298.155), Beta(alpha = 0.492615, beta = 0.484034, a = -1.0002, b = 1.0002), ComposedCopula(NormalCopula(R = [[ 1 -0.00655856 -0.00160652 0.249044 ]
[ -0.00655856 1 0.769256 -0.0138322 ]
[ -0.00160652 0.769256 1 -0.00830654 ]
[ 0.249044 -0.0138322 -0.00830654 1 ]])))
Total running time of the script: ( 0 minutes 9.226 seconds)