ReplicaX
From Opasnet
Jump to navigation
Jump to search
| Moderator:Jouni (see all) |
|
|
| Upload data
|
Question
How to generate an artificial data from a real data in such a way that
- individuals are no longer identifiable from the data,
- the marginal distributions are close to those of the original data,
- the correlation structure is close to that of the original data?
Answer
Rcode ReplicaX by Juha Karvanen does exactly that.
⇤--arg1500: . Table data cannot be pasted to the code for some reason. --Jouni (talk) 11:33, 18 November 2018 (UTC) (type: truth; paradigms: science: attack)
# name:dat|description:Copy your data table here|type:table| Causes an error.
library(OpasnetUtils)
library(Lmoments)
library(bayesm)
library(MASS)
library(plyr)
library(compiler)
objects.latest("Op_en6248", code_name = "initiate") # Fetch the ReplicaX functions
enableJIT(3)
set.seed(20131103) #to reproduce the results
#Generating replica data
for(i in (1:ncol(dat))[vartype == 'continuous']) {
colnames(dat)[i] <- levels(dat[[i]])[dat[[1 , i]]]
dat[[i]] <- as.numeric(levels(dat[[i]])[dat[[i]]])
}
dat <- dat[2:nrow(dat) , ]
replica <- ReplicaX(dat, vartype, k = k, osigma = osigma)
objects.put(replica)
cat("The replica table is saved. Save the key of the model run to retrieve your data.\n")
if(verbose) {
cat("Copy the replica data from here.\n")
oprint(replica)
} else {
cat("Your replica data looks like this:\n")
oprint(head(replica))
}
|
Show example data and analysis (Making a new run takes ca. 4 minutes)
#Fil: http://www.tilastotiede.fi/ReplicaX/example_MONICA.r
#Author: Juha Karvanen (juha.karvanen at iki.fi)
#
#Summary: This code demonstrates the creation of data replicas with ReplicaX R functions.
#The R functions needed are available as http://www.tilastotiede.fi/ReplicaX/ReplicaX_functions.r
#For information see http://www.tilastotiede.fi/ReplicaX/ReplicaX_esitys.pdf (in Finnish)
#
#The data used is a published sample of WHO MONICA data.
#Before running the code, you should download the data from
#http://www.thl.fi/publications/monica/monograph_cd/data/form04_3.zip
#
#The data documentation can be found at
#http://www.thl.fi/publications/monica/monograph_cd/formats/form048.htm
#http://www.thl.fi/publications/monica/monograph_cd/formats/survey.htm
#
#For more information on the MONICA project see http://www.thl.fi/monica/
library(OpasnetUtils)
objects.latest("Op_en6248", code_name = "initiate") # Fetch the ReplicaX functions
# Fetch the Monica data
monica <- opasnet.csv(filename = "f/f7/Monica.zip", wiki = "opasnet_en", unzip = "Monica.csv", sep = ",", header = TRUE)
library(Lmoments)
library(bayesm)
library(MASS)
library(plyr)
library(compiler)
enableJIT(3)
print("ReplicaX demonstration with the MONICA data. Running the code takes a couple minutes.")
set.seed(20131103) #to reproduce the results
selectedcolnames <- c("marit","edlevel","school","height","weight","waist","hip")
monica2 <- monica[,selectedcolnames]
vartype <- c("discrete","discrete","discrete","continuous","continuous","continuous","continuous")
#Generating replica data
monica2_r <- ReplicaX(monica2,vartype,k=5,osigma=0.5)
#Rounding values to the original precision (Depending on the use case, this step can be skipped.)
monica2_r$height <- round(monica2_r$height)
monica2_r$weight <- round(monica2_r$weight,digits=1)
monica2_r$waist <- round(monica2_r$waist,digits=1)
monica2_r$hip <- round(monica2_r$hip,digits=1)
oprint(head(monica2))
print("Summary statistics:")
print("Original data:")
oprint(summary(monica2))
print("Replica data:")
oprint(summary(monica2_r))
print("The original dataset has 8 individuals with marital status '5, (other)'.
These individuals cannot be identified from the replica dataset.")
marit5ind <- (monica2$marit==5)
print("Original data:")
oprint(monica2[marit5ind,])
print("Corresponding rows in the replica data:")
oprint(monica2_r[marit5ind,])
print("Cross-tabulation")
print("Original data:")
oprint(table(monica2$marit,monica2$edlevel))
print("Replica data:")
oprint(table(monica2_r$marit,monica2_r$edlevel))
print("Means and standard deviations by the marital status:")
print("Original data:")
oprint(ddply(monica2,.(marit),summarize,meanweight=mean(weight,na.rm=T),sdweight=sd(weight,na.rm=T),
n=sum(is.finite(weight)),se=sdweight/sqrt(n)))
print("Replica data:")
oprint(ddply(monica2_r,.(marit),summarize,meanweight=mean(weight,na.rm=T),sdweight=sd(weight,na.rm=T),
n=sum(is.finite(weight)),se=sdweight/sqrt(n)))
#Scatter plots with the original and the replica data
plot(monica2$waist,monica2$weight,xlab="Waist circumference (cm)",ylab="Weight (kg)")
points(monica2_r$waist,monica2_r$weight,col="red")
legend("topleft",c("original data","replica data"),pch=c(1,1),col=c("black","red"))
plot(monica2$height,monica2$weight,xlab="Height (cm)",ylab="Weight (kg)")
points(monica2_r$height,monica2_r$weight,col="red")
legend("topleft",c("original data","replica data"),pch=c(1,1),col=c("black","red"))
plot(monica2$weight/(monica2$height/100)^2,monica2$waist/monica2$hip,
xlab="Body mass index",ylab="Waist/hip ratio")
points(monica2_r$weight/(monica2_r$height/100)^2,monica2_r$waist/monica2_r$hip,col="red")
legend("topright",c("original data","replica data"),pch=c(1,1),col=c("black","red"))
print("Correlation coefficients:")
print("Original data:")
oprint(cor(monica2[,4:7],use="complete.obs"))
print("Replica data:")
oprint(cor(monica2_r[,4:7],use="complete.obs"))
print("End of demonstration.")
|
Rationale
Calculations
Replica R function by Juha Karvanen does exactly that.
library(OpasnetUtils)
ReplicaX <- function(x,vartype,k=5,osigma=0.5)
{
y <- replica_discrete(x,vartype,k=5)
y_continuous <- replica_continuous(x[,vartype=="continuous"],osigma=osigma)
y[,vartype=="continuous"] <- y_continuous
return(y)
}
replica_discrete <- function(x,vartype,k=5)
{
y <- x
n <- nrow(x)
distmat <- rowwise_similarity(x,x,vartype=vartype)
neighbors <- t(apply(distmat,2,FUN=which.maxk,k=k+1)[-1,])
for(i in 1:n)
{
y[i,] <- new_obs2(x[neighbors[i,],],vartype=vartype)
}
return(y)
}
new_obs2 <- function(xx,vartype)
{
xnew <- xx[1,]
k <- nrow(xx)
for(j in 1:ncol(xnew))
{
if(vartype[j]=="continuous")
{
finiteind <- is.finite(xx[,j])
kfinite <- sum(finiteind)
if(kfinite>1)
{
xnew[j] <- sum(rdirichlet(rep(1,kfinite))*xx[finiteind,j])
} else {
xnew[j] <- NA
}
} else {
ind <- as.logical(rmultinom(1,1,prob=rep(1/k,k)))
xnew[j] <- xx[ind,j]
}
}
return(xnew)
}
which.maxk <- function(x,k)
{
return(order(x,decreasing=T)[1:k])
}
observation_similarity <- function(v1,d2,vartype,variance)
{
#v1 one row dataframe, d2 dataframe
p <- ncol(v1)
n <- nrow(d2)
score <- rep(0,n)
for(j in 1:p)
{
if(vartype[j]=="continuous")
{
score <- score + pmax(0,1-(v1[,j]-d2[,j])^2/variance[j],na.rm=T)
} else {
score <- score + as.numeric(as.character(d2[,j])==as.character(v1[,j]))
}
}
return(score)
}
navar <- function(x)
{
p <- ncol(x)
v <- rep(NA,p)
for(j in 1:p)
{
if(is.numeric(x[,j])) v[j] <- var(x[,j])
}
return(v)
}
rowwise_similarity <- function(x,y,vartype)
{
n <- nrow(x)
similmat <- matrix(NA,nrow=n,ncol=nrow(y))
variance <- navar(y)
for(j in 1:n)
{
similmat[j,] <- observation_similarity(x[j,],y,vartype=vartype,variance=variance)
}
return(similmat)
}
kernelCDF <- function(x,n=512,cut=3)
{
bw <- density(x,give.Rkern=T,bw="SJ")
xp <- seq(min(x)-cut*bw,max(x)+cut*bw,length.out=n)
A <- outer(xp,x,pnorm,sd=bw)/length(x)
cdf <- rowSums(A)
return(data.frame(x=xp,cdf=cdf))
}
replica_continuous <- function(x,osigma,umin=0.00000001,umax=0.99999999,kernelweight=0.9)
{
n <- dim(x)[1]
p <- dim(x)[2]
if(length(osigma)==1) osigma<-rep(osigma,p)
u <- matrix(NA,nrow=n,ncol=p)
uparam <- matrix(NA,nrow=n,ncol=p)
ukernel <- matrix(NA,nrow=n,ncol=p)
y <- x
yparam <- x
ykernel <- x
param <- list()
bestpdf <- list()
kcdf <- list()
ind <- 1:n
#Modeling of marginal distributions
for(j in 1:p)
{
xf <- na.omit(x[,j])
ind_finite <- is.finite(x[,j])
if(kernelweight>0 & kernelweight<1)
{
param[[j]] <- data2normpoly4(xf)
uparam[ind_finite,j] <- pnormpoly(x[ind_finite,j],param[[j]])
kcdf[[j]] <- kernelCDF(xf)
pspl <- splinefun(kcdf[[j]]$x,kcdf[[j]]$cdf,method="monoH.FC")
ukernel[ind_finite,j] <- pspl(x[ind_finite,j])
u[ind_finite,j] <- (1-kernelweight)*uparam[ind_finite,j] + kernelweight*ukernel[ind_finite,j]
u[!ind_finite,j] <- NA
}
if(kernelweight==1)
{
kcdf[[j]] <- kernelCDF(xf)
ukernel[ind_finite,j] <- kcdf[[j]]$cdf[ind_finite]
u[ind_finite,j] <- ukernel[ind_finite,j]
u[!ind_finite,j] <- NA
}
if(kernelweight==0)
{
param[[j]] <- data2normpoly4(xf)
uparam[ind_finite,j] <- pnormpoly(x[ind_finite,j],param[[j]])[ind]
u[ind_finite,j] <- uparam[ind_finite,j]
u[!ind_finite,j] <- NA
}
u[ind_finite,j] <- pmin(umax,pmax(u[ind_finite,j],umin))
}
#Generation of correlated noise in Gaussian space
unorm <- qnorm(u)
if(is.null(osigma))
{
dr <- dist(unorm[sample(1:n,1000),])
osigma <- quantile(as.vector(dr),0.001)
}
normcor <- cor(unorm,use="complete.obs")
epsilon <- mvrnorm(n,mu=rep(0,p),Sigma=diag(osigma)%*%normcor%*%diag(osigma))
#Generation of new observations
for(j in 1:p)
{
noisyu <- unorm[,j]+epsilon[,j]
ind_finite <- is.finite(x[,j])
noisyu[ind_finite] <- noisyu[ind_finite]/sd(noisyu[ind_finite])
if(kernelweight>0 & kernelweight<1)
{
yparam[ind_finite,j] <- qnormpoly(pmin(umax,pmax(umin,pnorm(noisyu[ind_finite]))),
param=param[[j]])
qspl <- splinefun(kcdf[[j]]$cdf,kcdf[[j]]$x,method="monoH.FC")
ykernel[ind_finite,j] <- qspl(pnorm(noisyu[ind_finite]))
y[ind_finite,j] <- (1-kernelweight)*yparam[ind_finite,j] + kernelweight*ykernel[ind_finite,j]
}
if(kernelweight==0)
{
yparam[ind_finite,j] <- qnormpoly(pmin(umax,pmax(umin,pnorm(noisyu[ind_finite]))),
param=param[[j]])
y[ind_finite,j] <- yparam[ind_finite,j]
}
if(kernelweight==1)
{
qspl <- splinefun(kcdf[[j]]$cdf,kcdf[[j]]$x,method="monoH.FC")
ykernel[ind_finite,j] <- qspl(pnorm(noisyu[ind_finite]))
y[ind_finite,j] <- ykernel[ind_finite,j]
}
}
names(y) <- names(x)
return(y)
}
objects.store(
ReplicaX,
replica_discrete,
new_obs2,
which.maxk,
observation_similarity,
navar,
rowwise_similarity,
kernelCDF,
replica_continuous
)
cat("Saved objects ReplicaX, replica_discrete, new_obs2, which.maxk, observation_similarity, navar, rowwise_similarity, kernelCDF,
replica_continuous. Remember to load packages Lmoments, bayesm, MASS, plyr, and compiler before using ReplicaX.\n")
|
- Description of ReplicaX in the Apps4Finland contest (in Finnish).
See also
- Monica project: data File:Monica.zip
- Portal:Data
- op_fi:Apps4Finland
Keywords
References
Retrieved from "https://en.opasnet.org/index.php?title=ReplicaX&oldid=42682"