# Neighbor search algorithm
#New constraint selection algorithm
#PRELIMINARIES
setwd("/users/aleksei/documents/documents/umass/gp2")
library(combinat) #for regular expressions
library(mclust) #for mixture of Gaussians cluster analysis
options(stringsAsFactors = F)
#USER-DEFINED VARS
filename <- "tromso_c_sc_1"
which.language <- "toy" #Choose the toy language
#which.language <- "eng" #Choose English
candidate.set <- "full" #For English: all C(C)(C) sequences are allowed in the candidate set
#candidate.set <- "reduced" #For English: only (fricative) - consonants - (liquid) sequences are allowed in the candidate set
cycles <- 15 #the number of cycles the simulation will go through
accuracy_threshold <- 0.01 #this is the max. gain value a constraint must minimally have to be considered "relevant"
#accuracy_threshold <- 0.001
#accuracy_threshold <- 0.005
#accuracy_threshold <- 0.02
sample.size <- 1 #the proportion of omega that's used as a sammple at every cycle of the simulation (to make max gain computation faster)
if (which.language == "eng") {polarity <- c(1,-1)} #If English: Allow both negative and positive constraints
if (which.language == "toy") {polarity <- c(-1)} #If toy language: Allow only negative constraints
#REPRESENTATIONAL SPACE
#For toy language
if (which.language == "toy") {
#define sigma
sigma = c("a","i","u","p","t","k","b","d","g","m","n","q") # q = engma #The segments allowed in the toy language.
supersigma = sigma #these are all elements that may occur in constraints - segments and features
#Define the space of representations (omega) that the grammar will consider
#This will be all possible CVCVC strings with segments drawn from sigma
all.repr <- expand.grid( #a data frame of all possible CVCVCs
c("p","t","k","b","d","g","m","n","q"),
c("a","i","u"),
c("p","t","k","b","d","g","m","n","q"),
c("a","i","u"),
c("p","t","k","b","d","g","m","n","q")
)
#Now make the rows of this data frame into strings, and put them all into a vector
omega <- unlist( apply( all.repr, 1, paste, sep="", collapse="" ) ) #contains 248832 representations total
#Now compute the set of all observed forms given the setup of the toy language
all.obs <- expand.grid(
c("p","t","k","b","d","g"), #first position contains non-nasal consonants
c("a","i","u"), #second position contains only vowels
c("p","t","k","b","d","g","m","n","q"), #third position may contain any consonant
c("a","i","u"), #fourth position contains only vowels
c("p","t","k","b","d","g","n","q") #fifth position contains all consonants except "m"
)
#all.obs <- subset(all.obs, !(all.obs[,2]=="u" & all.obs[,3] %in% c("p","b","m") & all.obs[,4]=="u") ) #Remove all "u"-labial-"u" sequences from this frame
all.obs <- subset(all.obs, !(all.obs[,2] %in% c("u","i") & all.obs[,3] %in% c("p","b","m") & all.obs[,4] %in% c("u","i") ) ) #Remove all "u/i"-labial-"u/i" sequences from this frame
#Now, put everything in a matrix of strings
observed <- unlist( apply( all.obs, 1, paste, sep="", collapse="" ) )
}
#For English case study
if (which.language == "eng") {
sigma = c("p","t","k","b","d","g","f","T","s","h","v","D","z","C","J","m","n","w","r","l","y") #engma is not included, since it doesn't occur in onsets!
#My own transcription: T = theta; D = edh; C = ch; J = dzh; y = yod
#"Supersigma" stands for all the symbols that may be used in random constraint induction - these are all the symbols of sigma plus all clusters induced
supersigma = sigma
if (candidate.set == "reduced") {
fricatives = c("f","T","s","h","v","D","z")
liquids = c("w","r","l","y")
#Representational space: all sequences that fit the template (fricative) - consonant - (liquid)
all.repr <- expand.grid( c(fricatives,""), sigma, c(liquids,""))
}
if (candidate.set == "full") { #if all C(C)(C) sequences are to be in the candidate set
#Representational space: all imaginable onsets = all C(C)(C) sequences
all.repr <- expand.grid( sigma, c(sigma,""), c(sigma,""))
}
#Now make the rows of this data frame into strings, and put them all into a vector
omega <- unlist( apply( all.repr, 1, paste, sep="", collapse="" ) )
#Remove all duplicates from omega
omega <- omega[!duplicated(omega)]
#Observed data for English case study: taken from Hayes & Wilson (2008) ("non-exotic" corpus) - except that "s" and "sh" are collapsed and frequency is ignored.
observed <- c("k", "r", "d", "s", "m", "p", "b", "l", "f", "h", "t", "pr", "w", "n", "v", "g", "J", "st", "tr", "kr", "gr", "C", "br", "sp", "fl", "kl", "sk", "y", "fr", "pl", "bl", "sl", "dr", "kw", "str", "T", "sw", "gl", "hw", "sn", "skr", "z", "sm", "Tr", "skw", "tw", "spr", "sr", "spl", "D", "dw", "gw", "Tw", "skl")
}
#PHONETIC DISTANCE
# define phonetic distance between members of sigma (in the form of a data frame, with both row and column names equal to the set of segments in sigma)
#Toy language
if (which.language == "toy") {
phonetic_distance = data.frame(
#"a"=c(0,3,3,rep(5,9)),
#"i"=c(3,0,3,rep(5,9)),
#"u"=c(3,3,0,rep(5,9)),
"a"=c(0,1,1,rep(5,9)),
"i"=c(1,0,1,rep(5,9)),
"u"=c(1,1,0,rep(5,9)),
"p"=c(rep(5,3), 0,1,2, 1,2,3, 2,3,4),
"t"=c(rep(5,3), 1,0,1, 2,1,2, 3,2,3),
"k"=c(rep(5,3), 2,1,0, 3,2,1, 4,3,2),
"b"=c(rep(5,3), 1,2,3, 0,1,2, 1,2,3),
"d"=c(rep(5,3), 2,1,2, 1,0,1, 2,1,2),
"g"=c(rep(5,3), 3,2,1, 2,1,0, 3,2,1),
"m"=c(rep(5,3), 2,3,4, 1,2,3, 0,1,2),
"n"=c(rep(5,3), 3,2,3, 2,1,2, 1,0,1),
"q"=c(rep(5,3), 4,3,2, 3,2,1, 2,1,0),
row.names=sigma
)
}
#English case study
if (which.language == "eng") {
phonetic_distance <- read.table(file = "english_phonetic_distance.csv", header = T)
}
max_phon_dist <- max(phonetic_distance) #This is the highest value present in the phonetic distance matrix
#CLUSTER SETUP
#Initialize empty data frame of clusters; this vector will later change, which will lead to expansion of the constraint set
clusters_matrix <- matrix( nrow = length(sigma), dimnames = list(sigma) ) #"clusters_matrix" is a matrix which has rows named after foci, and every column in the matrix will be a cluster, which is represented as a probability distribution over those foci
#Set up a vector of possible cluster labels; which are combinations of two capital letters
capital_letters <- c("A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z")
cluster_labels <- do.call( paste, c(expand.grid(capital_letters, capital_letters), sep=""))
cluster_translation <- data.frame( Cluster.name = c(), Reg.expresion = c() ) #Data frame which translates cluster labels to regular expressions
#initialize empty vector of clusters and intersections, to be encoded as regular expressions
total_regexp <- c()
#MAXIMUM GAIN COMPUTATION
#I stole this function from Robert's code so I could add a gradient as an attribute to the gain function
grad <- function(x) attr(x, "gradient")
"grad<-" <- function(x,value) {
attr(x, "gradient") <- value
x
}
# Puts an L2 (log-space Gaussian) prior on an optimization
#Taken from Robert's script
l2.prior <- function(fun, var=1, mean=0, ...) {
fun <- match.fun(fun)
function(pv) {
res <- fun(pv) + (sum(pv - mean)^2) / (2 * var)
if ( !is.null(grad(res)) ) grad(res) <- grad(res) + ((pv - mean) / var)
res
}
}
#The following function returns the maximum gain value
max.gain <- function(obs.prob, pred.prob, rep_space, new_constr) { #p is the vector of observed probabilities, q is the vector of probabilities predicted by the current grammar, and "new_constr" is the new constraint whose maximal gain value will be computed
c.star <- violate(new_constr, rep_space) #compute the violation/reward vector of "new_constr" (the new constraint who gain value is to be computed)
gain <- function(w.star) { #a function of the weight of the new constraint
gain.value <- w.star * sum(obs.prob*c.star) - log( sum( exp(w.star*c.star) * pred.prob) ) #minus signs are removed from Wilson (2010) definition, and Della Pietra's definition is followed instead
gain.value <- -gain.value #output the negative of the function, because optim minimizes rather than maximizes functions
grad(gain.value) <- - ( sum(obs.prob * c.star) - (sum( c.star*exp(w.star*c.star)*pred.prob ) / sum( exp(w.star*c.star)*pred.prob ) )) #Della Pietra's first order derivative
gain.value #Output the function with the gradient as an attribute to it
}
gain <- l2.prior(gain, var=100000, mean=0) #Put an L2 prior on maximum gain computation - with variance = 10,000 and mean = 0
opt.gain <- optim( par = 0, fn = gain, lower = 0, method="L-BFGS-B" ) #minimize the "gain" function with the initial value of w.star being 0
-opt.gain$value #Output the negative of the value of "gain" associated with its optimal weight (because "gain" was made negative to allow it to be minimized instead of maximized).
}
#Now, a function which determines the violations for a constraint
#Constraints are a series of cells in a data frame!
violate <- function(constr, rep.space) {
constr.rgxp <- paste(constr[2:4], collapse="") #Bring the characters of the constraint together as a regular expression
#constr.rgxp <- paste(constr[2:5], collapse="") #Bring the characters of the constraint together as a regular expression
matches.vector <- sapply( rep.space, function(cand) { #omega is the full set of candidates in the representational space
match <- gregexpr(constr.rgxp, cand)[[1]]
length( match[match!=-1] ) #Find the number of matches of "const.rgxp" in the candidate; the "match != -1" condition is there because gregexpr returns -1 when there is no match
}) #This generates a vector of the number of matches for every candidate in omega
matches.vector*as.numeric(constr[1]) #multiply the vector of matches by 1 or -1, depending on whether the constraint is positive or negative
}
#MAXIMUM ENTROPY CALCULATION
#Compute observed probabilities
p <- sapply( omega, function(cand){
length(which(observed==cand))/length(observed)
} ) # this is the observed probability (the number of times an element of omega shows up in the observed data divided by the total number of observations)
q <- rep(1/length(omega), length(omega)) # this will be the predicted probability of every form; initially, it will be set to equal probability to every form
prob.frame <- data.frame(omega=omega, p=p, q=q, stringsAsFactors=F) #this is the data frame which will hold p and q for
#cons.matrix will be a matrix in which each column is the violation vector of a constraint; columns will be indexed with the row number corresponding to that constraint in the "constraints" data frame
cons.matrix <- NULL #This will be filled in with constraints as they are induced
#"weights" will be a vector of the weights associated with every constraint in cons.matrix; this vector is provided by the output of the "optim" function
weights <- c() #This will contain the weights of these constraints
#when new constraints are added, as zeroes are added to "weights" as there are new constraints
#the result is fed to "optim" as initial weights for optimization
#Cover function for optimizing the objective function; biases can be turned on or off.
solve <- function(violation.matrix, wts, lower, var=10000) {
w <- wts
obj <- objective.function(violation.matrix)
obj <- l2.prior(obj,var,mean=0)
opt <- optim(w, obj, lower=lower, method="L-BFGS-B")
opt
}
#Objective function based on K-L divergence - based on Robert's "solver.R" script
objective.function <- function(violations) { #plug the cons.matrix matrix into this
objective <- function(w) {
scores <- exp(c(violations %*% w))
Z <- sum(scores)
#actual K-L divergence function
res.df <- data.frame(prob.frame, scores=scores)
res.df$predicted <- log(scores) - log(Z)
res.df$kl <- rep(0,nrow(res.df))
nonzeroes <- which(res.df$p != 0)
res.df$kl[nonzeroes] <- res.df$p[nonzeroes] * (-log(res.df$p[nonzeroes]) + res.df$predicted[nonzeroes])
res <- -sum(res.df$kl)
# res <- -sum(by(data.frame(prob.frame, scores=scores), prob.frame$omega, function(output) { #Apply the following function to prob.frame with the score for every form attached to the right - by output
#
# q.target <- output$p[1] #The target observed probability for an output for each q-value to match
#
# num <- sum(output$scores) #Sum the scores for the output
#
# p.weights <- sum(log(num) - log(Z))
#
# if(q.target == 0.0) {
# 0.0
# }
# else {
# q.target * (-log(q.target) + p.weights)
# }
#
#
# }#endfunction
#
# ))#endsum
den.p <- scores / sum(scores)
den <- colSums(den.p * violations)
num <- violations
grad.matrix <- prob.frame$p * (num - den)
grad(res) <- -colSums(grad.matrix)
# grad(res) <- -colSums(do.call(rbind,
# by(data.frame(p=prob.frame$p, scores=scores, violations=violations), prob.frame$omega[,drop=TRUE],
# #sum over all output forms
# function(output) {
# p <- output$p[1]
# feat <- as.matrix(output[,-(1:2)])
#
# num.p <- output$scores / sum(output$scores)
#
# num <- colSums(num.p * feat)
#
# p * (num - den)
# }, simplify=F), #this is to make sure that "by" spits out a list rather than an array
# ))#endcolSums
res #output the result
}#endfunction
objective #output the function
}#endfunction
#CONSTRAINT SPACE SEARCH
#mutate : a function which introduces mutations from a prototype constraint
mutate <- function(input.cons) { #The "quant" argument signifies whether all mutations or only one of them is required
output.cons <- data.frame(Polarity=c(), First=c(), Second=c(), Third=c()) #set up an empty data frame for the new constraint
method = sample(c(rep("sf.change",3), "pol.change", "ins.del"),1) #Choose the method of mutation: segment/feature change (3x as probable), polarity change, insertion/deletion)
if (method == "sf.change") {#Segment/feature change
new.cons <- input.cons
available_pos <- which( (new.cons[1,] %in% supersigma )) #See which positions in the constraint may be affected by segment/feature change (=those that have a segment or feature (combination) in them).
pos <- sample(c(available_pos,available_pos),1) #select one of these positions to change its content
#I repeat "available_pos" twice because sample(2,1) means the same as sample(1:2,1) - this way sample won't think this is a one-member vector that's to be interpreted as an upper limit
if (new.cons[1,pos] %in% sigma) {#If the thing in the position selected is a segment
phon_dist_threshold <- sample(1:max_phon_dist,1) #This is the maximal
distances <- subset(phonetic_distance, select = new.cons[1,pos]) #find all the distances from a certain segment by selecting the column that is named after that segment
selection <- rownames(subset(distances,(distances <= phon_dist_threshold) & distances > 0)) #find the names of the rows in which the value in the source segment column is smaller than the maximal proximity threshold
selection <- c(selection, selection, supersigma[-(1:length(sigma))]) #The ultimate selection is twice the neighboring segments and once the set of features (so that you're twice as likely to pick from the neighboring segments as you are from the features).
} else { #If the thing in the position selected is a feature
selection <- supersigma[-(1:length(sigma))] #You can choose any feature to replace the
selection <- selection[-which(selection==new.cons[1,pos])] #Remove the original cluster from the selection
}#endif
if (length(selection)==0) { selection <- sigma } #If this leaves nothing - because there are no other clusters induced yet - then select a segment instead
replacement <- sample(selection,1) #pick one random element on the "selection" list which will replace the original segment in the original constraint
new.cons[1,pos] <- replacement
}#endif
if (method == "pol.change") { #Polarity change
new.cons <- input.cons
new.cons$Polarity <- -1*new.cons$Polarity #Flip the polarity of the input constraint
output.cons <- rbind(output.cons, new.cons) #Add this constraint to the output constraint data frame
}#endif
if (method == "ins.del") { #insertion/deletion
new.cons <- input.cons
if (new.cons$Third == "") {
insert.pos <- sample(1:3,1) #Standard case: insertion into any position
if (new.cons$Second == "$") {insert.pos <- sample(1:2,1)} #If the constraint ends in a word boundary, don't insert anything after the word boundary
if (new.cons$First == "^") {insert.pos <- sample(2:3,1)} #If the constraint starts in a word boundary don't insert anything in the first position
if (insert.pos == 1) {
new.cons$Third <- new.cons$Second
new.cons$Second <- new.cons$First
new.cons$First <- sample(c(supersigma,"^"),1) #Insert a segment, feature, or boundary sign into the first position of the constraint.
}
if (insert.pos == 2) {
new.cons$Third <- new.cons$Second
new.cons$Second <- sample(supersigma,1) #Insert a feature or a segment in the middle of the constraint.
}
if (insert.pos == 3) {new.cons$Third <- sample(c(supersigma,"$"),1)} #Insert a segment, feature or boundary sign at the end of the constraint
} else {
new.cons$Third <- ""
}#endif
}#endif
new.cons #This is the output: one single constraint that was changed
}#endfunction
#Now comes the part in which these procedures are actually called.
#initialize a "trash" data frame (which will contain all constraints that have been considered for addition to the grammar, so that no constraint is considered twice)
trash <- data.frame(Polarity = "", First = "", Second = "", Third = "")
################################ START CYCLE ###################################
#From this point on, the code will be repeated a given number of times
model.fit <- sum(subset(prob.frame, prob.frame$p > 0)$q ) #This is the approximation of how well the model fits the data: the total predicted probability on the observed data - this value will be very low at first
iteration <- 1
while (model.fit < 0.95) { #Repeat all of the following as long as model fit is less than 95%
writeLines(paste("\nCycle number ",iteration,".","\n", sep=""))
#Create a sample of the representational space to compute gain values over, to save time
#Currently, the sample is 1 of the rep. space
#size of the sample as a proportion of the total size of omega is given by the variable sample.size, as defined at the beginning of the script
sample.indices <- sample( 1:length(omega), round( sample.size * length(omega)), replace=F ) #take a sample of the positions of elements in omega
sample.p <- p[sample.indices]/sum(p[sample.indices])
sample.q <- q[sample.indices]/sum(q[sample.indices])
sample.omega <- omega[sample.indices]
#RANDOM CONSTRAINT SELECTION
#This repeat loop will ensure that the algorithm keeps going until it finds a random constraint that's not been tried before
repeat {
#First, choose a constraint length between 2 and 4.
constraint_length <- sample(2:3,1)
#Now, create a random constraint of that length
if (constraint_length == 2) {random_constraint <- data.frame( Polarity= sample(polarity,1), First= sample(c(supersigma,"^"),1), Second= sample(c(supersigma,"$"),1), Third="") }
if (constraint_length == 3){random_constraint <- data.frame(Polarity= sample(polarity,1), First=sample(c(supersigma,"^"),1), Second=sample(supersigma,1), Third=sample(c(supersigma,"$"),1)) }
if ( length(trash$Polarity) > 0 & #If the trash data frame has anything in it at all
length( subset(trash,
trash$Polarity==random_constraint$Polarity
& trash$First==random_constraint$First
& trash$Second==random_constraint$Second
& trash$Third==random_constraint$Third
)$Polarity) == 0 ) { #If the randomly generated constraints cannot be found in the data frame containing constraints already considered (= trash),
random.gain <- max.gain(sample.p, sample.q, sample.omega, random_constraint) #compute the gain value of the randomly selected constraint
if ( random.gain >= accuracy_threshold) {#if the gain value isn't good enough, then keep trying until you hit a constraint with a sufficiently high gain value
prototype <- random_constraint #if the constraint is accurate enough, declare its original role number to be the prototype for neighbor search
break #the while statement is in principle an infinite loop; it is halted whenever a fitting prototype is found
} else {
trash <- rbind(trash, random_constraint) #if the constraint isn't accurate enough, add its row number in the original "constraints" frame to "trash"
}#endifelse
}#endif
}#endrepeat
writeLines("Random constraint to intialize search at the current cycle:\n")
print(prototype)
writeLines("\n")
#NEIGHBOR SEARCH
## Now, submit this prototype to the neighbor search procedure.
#Initialize a stack of constraints whose neighbors should be investigated and a frame of outputs
input <- prototype #"stack" is a vector of constraint row numbers whose neighbors should be explored
output_frame <- prototype #"output_frame" is the data frae of all constraints found at this iteration of the search algorithm
current.gain <- random.gain #current.gain is a variable that signifies the highest gain value that has been found up until now
dipping.count <- 0 #dipping.count is a variable that counts how many times one input yields constraints with lower gain values (it's a variable that prevents an infinite loop)
doubling.count <- 0 #doubling.count is the number of times that the loop finds a constraint that is in local trash, in a row
local.trash <- trash #Define a local trash data frame - one that is internal to the neighbor search procedure and which will not be carried over into future cycles of the simulation
#This data frame also contains the regular trash data frame, so that neighbor search never ends up with a constraint which has previously been added to the grammar.
while (nrow(input) > 0 & dipping.count <= 10 & doubling.count <= 10) { #as long as there is an input constraint and the algorithm hasn't found constraints with lower gain values more than 10 times for the same input
#while (nrow(input) > 0 & dipping.count <= 25 & doubling.count <= 10) { #as long as there is an input constraint and the algorithm hasn't found constraints with lower gain values more than 25 times for the same input
current_constraint <- input[1,] #consider the input constraint as the current constraint
#print(cbind(current_constraint, current.gain))
neighbor_constraint <- mutate(current_constraint) #Introduce a single mutation
if ( nrow( subset( local.trash, #If the neighboring constraint is not on local trash
local.trash$Polarity==neighbor_constraint$Polarity
& local.trash$First==neighbor_constraint$First
& local.trash$Second==neighbor_constraint$Second
& local.trash$Third==neighbor_constraint$Third
) ) == 0 ) {
doubling.count <- 0
local.trash <- rbind(local.trash,neighbor_constraint) #put the neighbor constraint in the "trash" frame so it isn't tried again
input <- input[-1,] #erase the current input, so that a new constraint can be put in its place
neighbor.gain <- max.gain(prob.frame$p, prob.frame$q, omega, neighbor_constraint) #Compute the information gain value of the neighboring constraint.
if (neighbor.gain < current.gain) { #If this neighbor has a lower gain value than the current reference value
input <- current_constraint #Try the current constraint as an input once again
dipping.count <- dipping.count + 1
}#endif
if (neighbor.gain == current.gain) { #If this neighbor has the same gain value as the input constraint
output_frame <- rbind(output_frame, neighbor_constraint) #Add the neighboring constraint to the output frame alongside the input
input <- neighbor_constraint #Also make the neighboring constraint the current input
dipping.count <- 0
}#endif
if (neighbor.gain > current.gain) { #If this neighbor has the same gain value as the input constraint
output_frame <- neighbor_constraint #Overwrite the output frame with the neighbor constraint
input <- neighbor_constraint #Also make the neighboring constraint the current input
current.gain <- neighbor.gain #Make this new gain value the standard for future comparison
dipping.count <- 0
}#endif
} else { doubling.count <- doubling.count + 1 } #endifelse
}#endwhile
trash <- rbind(trash, output_frame) #Add the selected constraints to trash
writeLines("Constraint(s) selected at this cycle:\n")
print(output_frame)
writeLines("\n")
# put everything in a focus-by-context table
#First, determine all the contexts, and put these in as the first 4 (or 5) columns of a data frame.
to_be_tabulated <- output_frame[,1:4] #this is the frame of all constraints that were selected by the l algorithm
#to_be_tabulated <- output_frame[,1:5] #this is the frame of all constraints that were selected by the neighbor search algorithm
# make all possible replacements of non-zero elements with a placeholder "_"
contexts_frame <- data.frame("Polarity" = c(), "First" = c(), "Second" = c(), "Third" = c())
#contexts_frame <- data.frame("Polarity" = c(), "First" = c(), "Second" = c(), "Third" = c(), "Fourth" = c())
longest <- constraint_length + 1 #What is the length of all the constraints in the output frame? That length plus one is the longest you have to search to make the context-by-focus table
for (cons in 1:length(to_be_tabulated[,1])) { #for every constraint in the "to be tabulated" data frame
for (j in 2:longest) { #for every position in the constraint
context <- to_be_tabulated[cons,] #take out the row of the contexts_frame that's to be changed now
if ((context[ , j] != "^") & (context[ , j] != "$") ) { #if the current symbol of the constraint is not a word boundary
context[ , j] <- "_" #replace the constraint with "_" in the appropriate position
contexts_frame <- rbind(contexts_frame, context) #add it to the "contexts_frame" data frame
}
}
}
#Get rid of duplicates, so that the list of contexts is as short as possible
contexts_frame <- subset(contexts_frame, !duplicated(contexts_frame)) #this gets rid of all the rows which are the exact same as some row above them
#Now, fill out the focus-by-context data frame by substituting every possible segment into the blanks and computing the corresponding gain value
for (segment in sigma) { #for every possible segment, put it in the gap left in the context and look up the corresponding gain value
column_number <- length(contexts_frame) + 1 #this is the number of the column that needs to be added to the contexts frame right now
focus.substitution.frame <- contexts_frame[,1:4]
#focus.substitution.frame <- contexts_frame[,1:5]
focus.substitution.frame[,2:4] <- apply(focus.substitution.frame[,2:4], 2, function(x){sub("_",segment,x)}) #substitute the current segment value instead of "_" in each context of the contexts_frame
#focus.substitution.frame[,2:5] <- apply(focus.substitution.frame[,2:5], 2, function(x){sub("_",segment,x)}) #substitute the current segment value instead of "_" in each context of the contexts_frame
new_column <- apply( focus.substitution.frame, 1, function(x){ max.gain(sample.p, sample.q, sample.omega, x) } ) #compute the gain value for each constraint made by filling in every context by the focus under scrutiny now
contexts_frame[,column_number] <- new_column
names(contexts_frame)[column_number] <- segment
}
#print(contexts_frame)
#CLUSTERING
#The "find_cluster" function applies to a row of "contexts_frame" to find the content of the cluster with the higher mean
find_cluster <- function(rownumber) {
to_be_clustered <- contexts_frame[rownumber,-(1:4)] #this is the row of the contexts frame over which the one-dimensional clustering analysis will be performed - and the non-numerical values at the left hand side of the frame are excluded
#to_be_clustered <- contexts_frame[rownumber,-(1:5)] #this is the row of the contexts frame over which the one-dimensional clustering analysis will be performed - and the non-numerical values at the left hand side of the frame are excluded
noise_vector <- sample(c(0 , 0.0000001), length(to_be_clustered), replace = T) #make a random vector of the same length as "to be clustered", with either 0 or 0.0000001 as values
#This is done so that the clustering algorithm will actually find two clusters in a case like "0 0 0 0 0 0 0 0 0 1 1 1".
to_be_clustered <- to_be_clustered + noise_vector #the noise vector is added to the "to_be_clustered" values (added and not subtracted, so that negative values are not created)
#obtain the parameters for the two Gaussians that were fit, so that these can be submitted to the expectation maximization algorithm
if ( !is.na(mclustBIC(to_be_clustered,G=1:2,modelNames="E")[2,]) ) { #If it is at all possible to fit a 2-Gaussian model to the data from a given row, then go ahead and find the following values:
row_Mclust <- Mclust( data = to_be_clustered, #take all the columns except the first four
G = 2, #look for 2 components
modelNames = "E" ) #use a model in which the variance of both components has to be the same
if ( length(which(row_Mclust$classification==2)) >= 2 ) { #if, for the cluster with the higher mean, there is are at least two
cluster_content <- row_Mclust$z[,2] #Take the cluster with the highest mean and
return(cluster_content)
} #If the "cluster" contained just one element, then don't return anything
} #Endif with the condition that it's possible to fit a two-component model
} #endfunction
#Now, call this function for every row number in the "contexts_frame" frame
all_clusters <- lapply(1:length(contexts_frame$First), find_cluster)
#And after this, get rid of all the members of this list which are NULL.
to_delete <- c()
for (member in 1:length(all_clusters) ) { if (is.null(all_clusters[[member]])) {to_delete <- c(to_delete, member)} } #Find all the members of the all_clusters
if (length(to_delete) > 0) { all_clusters <- all_clusters[-to_delete] }
#print(all_clusters)
#Procedure for integrating these clusters with previously established clusters (unfinished)
#Then take these clusters, see if they've been established as clusters before, and if not, and if they're "significantly" different from other established clusters, then give them a unique label and add them to the list of clusters
if (length(all_clusters) > 0 & all( is.na(clusters_matrix[,1])) ) { #If no clusters have been added to the clusters matrix yet, and there are clusters to be added.
clusters_matrix[,1] <- all_clusters[[1]]
label <- sample(cluster_labels, 1)
colnames(clusters_matrix)[1] <- label
all_clusters <- all_clusters[-1]
cluster_labels <- cluster_labels[-which(cluster_labels==label)]
}
#now start comparing every cluster in "all clusters" to existing clusters in "clusters_matrix" and add them to the matrix whenever they are sufficiently different; if they are not sufficiently different, merge the current cluster with the cluster in "cluster_matrix" that it is most similar to
cluster_similarity <- function(clust1, clust2) { #The distance metric between two clusters (vectors of probabilities assigned to members of sigma) suggested in the meeting on April 3rd
sum( clust1/sqrt(sum(clust1^2)) * clust2/sqrt(sum(clust2^2)) )
}
similarity_threshold <- .9
#Now, integrate all clusters into the clusters_matrix
while (length(all_clusters) > 0) {
similarities <- apply( clusters_matrix, 2, cluster_similarity, all_clusters[[1]]) #compute the K-L divergence of the various clusters already present in the cluster matrix given the current cluster you're trying to add
maximal_similarity <- max(similarities) #Find the greatest similarity value between the cluster to be added and any other given cluster already in the clusters matrix
clusters_matrix <- cbind(clusters_matrix, all_clusters[[1]]) #add the cluster in question as a new column to the clusters matrix
if (maximal_similarity < similarity_threshold) { #if the maximal similarity of the new cluster to any other cluster is not large enough
label <- sample(cluster_labels, 1) #choose a random, unique label for it
colnames(clusters_matrix)[ncol(clusters_matrix)] <- label #rename the new column to the label just found
all_clusters <- all_clusters[-1] #remove the first cluster on the "all_clusters" list
cluster_labels <- cluster_labels[-which(cluster_labels==label)] #remove the label just assigned from the range of cluster labels which may be assigned
} else { #if the maximal similarity is at or above the threshold
cluster_to_be_modified <- which(similarities==maximal_similarity) #find the position of the maximally similar cluster in the cluster matrix
#If there's more than one column in clusters_matrix that the current cluster is maximally similar to, just pick a value; even when the cluster at hand is equally similar to two clusters with different labels.
pick_one <- sample(length(cluster_to_be_modified),1)
cluster_to_be_modified <- cluster_to_be_modified[pick_one] #pick one random member of "cluster_to_be_modified"
old_label <- names(cluster_to_be_modified) #The vector "cluster_to_be_modified" comes with names for each value that correspond to the name of the cluster the values stand for; so, "old_label" is the cluster label associated with the cluster that the currently considered cluster is about to be merged with.
colnames(clusters_matrix)[ncol(clusters_matrix)] <- old_label
all_clusters <- all_clusters[-1]
}
}
#print(clusters_matrix)
#UPDATE GRAMMAR
#Now, add the constraints in the neighborhood set to the grammar!
num.of.new.cons <- length(output_frame$Polarity) #This is how many new constraints are about to be added to the grammar
while (length(output_frame$Polarity) > 0 ) {
if (!exists("cons.matrix")) { #if cons.matrix has not yet been created
cons.matrix <- matrix(
data = violate(output_frame[1,], rep.space=omega), #take the first constraint in the output vector, and enter the violations of that constraint as the first column in the constraint
ncol = 1,
dimnames = list(omega, paste(unlist(output_frame[1,]),collapse="") ) #the rows should be named after the members of omega; the column is named after the number of the constraint in "constraints"
)
} else {
cons.matrix <- cbind(cons.matrix, violate(output_frame[1,], rep.space=omega) ) #add the vector of violations of the constraint under consideration (obtained through looking it up in "constraints") to the constraint matrix
colnames(cons.matrix)[ncol(cons.matrix)] <- paste(unlist(output_frame[1,]),collapse="") #the names of the constraint matrix columns are the shortened constraint definitions
}
output_frame <- output_frame[-1,] #remove the first line of "output_frame", which had just been added to the grammar
}
#Optimize the weights of these constraints
#If the variable "weights" already exists, then simply append as much zeroes as there are new constraints to "weights"; so that the initial weights given to the optimization algorithm are the optimal weights for old constraints and zeroes for new constraints
#Otherwise, create "weights" and put as many zeroes in it as there are new constraints.
if (exists("weights")) {weights <- c(weights, rep(0, num.of.new.cons) ) } else { weights <- rep(0, num.of.new.cons) }
#Give lower bounds to constraint weights to the optimization algorithm
lower <- rep(0, length(weights))
optimized <- solve(cons.matrix, weights, lower=lower, var=1000) #Optimize the weights of the constraints currently in the grammar
#print(optimized)
weights <- optimized$par #read off the optimized weights
q <- exp(c(cons.matrix %*% weights)) / sum( exp(c(cons.matrix %*% weights)) ) #compute the probabilities of the things in q on the basis of the weights just computed
prob.frame$q <- q #Update the predicted probabilities in the master data frame.
#Spit out the currently generated grammar
writeLines("Constraint weights assigned at this cycle:\n")
print(data.frame(constraints=colnames(cons.matrix), weights=weights))
writeLines("\n")
#Delete all constraints that were assigned 0 weight
#zero.weights <- which(weights==0) #Find the spots in the "weights" vector which are 0
#cons.matrix <- cons.matrix[ , -zero.weights] #Delete the constraints which were assigned zero weight from the grammar
#weights <- weights[-zero.weights] #Delete zero weights for the current weights vector
#UPDATE CLUSTERS
#When the distributions have been updated, they are projected into statements of discrete sets of segments which are part of every label
#This should be done as follows: all columns of "clusters_matrix" that have a certain label are collected, and the average of probabilities is taken; given this result, all segments which have a probability above a certain cutoff value (for instance, .9) are assigned to that cluster; the content of a cluster may change over the course of the simulation
#The set of all possible constraints is then updated accordingly
cluster_names <- colnames(clusters_matrix)[!(duplicated(colnames(clusters_matrix)))] #Find all unique cluster labels in clusters_matrix
#Write a procedure which will average the values of all columns associated with a particular label, and project from the resulting probabilities a set of segments
new_cluster <- c() #This is to be the vector of clusters that were just added at this cycle of the simulation
for (name in cluster_names) {
homonym_matrix <- clusters_matrix[,which(colnames(clusters_matrix)==name)] #find all the columns in clusters_matrix which correspond to the same label
if (length(which(colnames(clusters_matrix)==name)) > 1) {
homonym_average <- rowSums(homonym_matrix)/ncol(homonym_matrix) #average over all the columns which correspond to the label in question
} else {
homonym_average <- homonym_matrix #if the homonym matrix only has one column, then the average of the matrix is the same as the matrix itself
}
name_content <- names(homonym_average[which(homonym_average > 0.5)]) #The segments that belong to a label are those which belong to that label with more than 0.6 probability.
reg.expression <- paste(name_content,sep="", collapse="") #Put all the segments that belong to the cluster together
reg.expression <- paste("[",reg.expression,"]",sep="") #Put brackets around them so they form an actually usable regular expression
if ( length(which(cluster_translation$Cluster.name==name)) > 0 ) { #if the cluster label is already in cluster_translation
cluster_translation[which(cluster_translation$Cluster.name==name), 2] <- reg.expression #then just overwrite the value for the reg expression associated with that label
} else {
cluster_translation <- rbind(cluster_translation, data.frame( Cluster.name = name, Reg.expression = reg.expression ) )
new_cluster <- c(new_cluster, name) #only clusters that have just been added are part of "new_cluster"
}
}
#COMPUTE INTERSECTIONS OF FEATURE CLASSES
if ( length(new_cluster) > 0 ) { #If there are any new clusters
#For every new cluster, find its intersections with all other clusters in cluster_translation
new_cluster_regexp <- subset(cluster_translation, cluster_translation$Cluster.name %in% new_cluster)$Reg.expression #Find all the regexpressions corresponding to the new clusters
cluster_regexp <- cluster_translation$Reg.expression #all reg expressions generated up until now
two_regexp_combinations <- expand.grid(cluster_regexp, new_cluster_regexp, "", stringsAsFactors = F) #this creates all the combinations of 2 regexpressions, whose intersection is to be computed
three_regexp_combinations <- expand.grid(cluster_regexp, cluster_regexp, new_cluster_regexp, stringsAsFactors = F) #this creates all the combinations of regexpressions, whose intersection is to be computed
regexp_combinations <- rbind(two_regexp_combinations, three_regexp_combinations) #These are all combinations of new regexpressions with new & old regexpressions that have 2 or 3 members
#Procedure for computing the reg
compute.intersections <- function(rowcontents) { #Take three reg. expressions
rowcontents <- as.character(rowcontents) #
rowcontents.2 <- lapply(rowcontents, function(x){ #Make sure the data frame cell is a character, not a factor level
reg.char <- unlist(strsplit(x, ""))
reg.char <- reg.char[-c(1,which(reg.char=="]"))]
})
#For each reg.expression, get its characters, and chop off the brackets at the begining and end
#Intersect the first two reg.expressions.
intersection1 <- intersect(rowcontents.2[[1]], rowcontents.2[[2]])
#Now, if the third reg. expression is non-empty, then compute the intersection of the first two reg.exp and the third reg. exp; otherwise, do nothing
if (length(rowcontents.2[[3]]) > 0) { intersection2 <- intersect(intersection1, rowcontents.2[[3]]) } else { intersection2 <- intersection1 }
#If the resulting intersection is non-null, paste it together as a regular expression
if (length(intersection2) > 0) {
result <- paste(intersection2, collapse="", sep="")
paste("[", result, "]", collapse="",sep="")
}
} #endfunction
all_intersections <- unlist(apply(regexp_combinations, 1, compute.intersections))
#Find all unique intersections
unique_intersections <- unique(all_intersections)
#Remove all intersections that are identical to earlier created clusters....
unique_intersections <- unique_intersections[-which(unique_intersections %in% total_regexp)]
#"New feature combinations" are those that are either newly found clusters, or new intersections of features.
new_feature_combinations <- c(new_cluster_regexp, unique_intersections)
#record the clusters and cluster intersections just created in the big book of clusters and intersections
total_regexp <- c(total_regexp, new_feature_combinations)
#UPDATE CONSTRAINT SPACE
#Add the new features to supersigma, so they can be used in new constraints
supersigma <- c(supersigma, new_feature_combinations)
} else { #endif conditional on there being some new clusters
new_cluster_regexp <- c() #If there's no clusters to be found at this iteration, output a zero new_cluster_regexp
}
writeLines("Total set of phonological classes induced up until now:\n")
print( cluster_translation$Reg.expression )
writeLines("\n")
#COMPUTE FIT OF MODEL
#The fit of the model to the data is approximated by computing the sum of the predicted probabilities of the observed candidates
model.fit <- sum(subset(prob.frame, prob.frame$p > 0)$q )
writeLines("Total likelihood of observed data:\n")
print( model.fit )
writeLines("\n")
#Now, loop this back into the random constraint search procedure.
iteration <- iteration + 1 #This variable will show in the output that a new cycle has begun
print("not done yet")
} #This bracket closes the giant while-loop that this whole part of the code is enclosed in, so that there is a loop from updating the constraint space to random constraint selection.
run <- run + 1
final.grammar <- subset(data.frame(constraints=colnames(cons.matrix), weights=weights), weights!=0) #The final grammar consists of all non-zero weight constraints plus their non-zero weights
final.grammar$data_likelihood <- rep(model.fit,nrow(final.grammar)) #add the total data likelihood of the current run as a factor to the data frame
final.grammar$final_cycle <- rep(iteration-1, nrow(final.grammar)) #add the cycle number as a factor to the data frame
final.grammar$run <- rep(run, nrow(final.grammar)) #add the number of the current run as a factor in the data frame
phonological_classes <- cluster_translation$Reg.expression #obtain the phonological classes induced at this run
phonological_classes <- paste(phonological_classes, collapse = "") #paste these phonological classes together in one string
final.grammar$phonological_classes <- rep(phonological_classes, nrow(final.grammar)) #put these in as an extra column in the data frame
write.table(final.grammar, file=paste(filename,".csv",sep=""), append=T, row.names=F) #write the final grammar to file