# For text version >= 0.9.99
# Install text from CRAN
install.packages("text")
library(text)

# Set up an environment with the Python packages required by text
textrpp_install()

# Initialize the environment and save the settings for next time
textrpp_initialize(save_profile = TRUE)

# # # # # # # # # # # # # # # # # # # # # # # # # # # #

# Example text
texts <- c("I am feeling relatedness with others", "That's great!")

# Defaults
embeddings <- textEmbed(texts)

# Token-level embeddings
embeddings$tokens

# Text-level embeddings
embeddings$texts
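
# Optional sanity check (not part of the original walk-through): inspect the
# overall structure of the returned object. This assumes the default output,
# i.e., a list with token-level and text-level embeddings as shown above.
str(embeddings, max.level = 2)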
# # # # # # # # # # # # # # # # # # # # # # # # # # # #

# Look at the example data included in the text package, which comprises both text and numerical variables (note that there are only 40 participants in this example).
Language_based_assessment_data_8

# Transform the text/word data to word embeddings (see help(textEmbed) for the default settings).
word_embeddings <- textEmbed(
  Language_based_assessment_data_8,
  model = "bert-base-uncased",
  aggregation_from_layers_to_tokens = "concatenate",
  aggregation_from_tokens_to_texts = "mean",
  keep_token_embeddings = FALSE
)

# See how the word embeddings are structured
word_embeddings
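
# Optional: list the embedded text variables and the dimensions of each
# embedding tibble (rows = participants, columns = embedding dimensions).
# This inspection step assumes word_embeddings$texts is a named list of tibbles.
names(word_embeddings$texts)
sapply(word_embeddings$texts, dim)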

# Save the word embeddings to avoid having to embed the text again.
# It is good practice to save output from analyses that take a lot of time to compute, which is often the case when analyzing text data.
saveRDS(word_embeddings, "word_embeddings.rds")

# Get the saved word embeddings (again)
word_embeddings <- readRDS("word_embeddings.rds")

# # # # # # # # # # # # # # # # # # # # # # # # # # # #

# Get hidden states for "I am fine"
imf_embeddings_11_12 <- textEmbedRawLayers(
  "I am fine",
  layers = 11:12
)
imf_embeddings_11_12

# OUTPUT
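
# Optional: inspect how the raw hidden states are organised (element names such
# as context_tokens follow the output structure used further below).
str(imf_embeddings_11_12, max.level = 2)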


# # # # # # # # # # # # # # # # # # # # # # # # # # # #


# 1. Concatenate layers 11 and 12 (results in 1,536 dimensions).
textEmbedLayerAggregation(
  imf_embeddings_11_12$context_tokens,
  layers = 11:12,
  aggregation_from_layers_to_tokens = "concatenate",
  aggregation_from_tokens_to_texts = "mean"
)
# OUTPUT
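
# Optional check of the resulting dimensionality. This sketch assumes the
# aggregated output is a list of tibbles whose columns are the embedding
# dimensions; "concatenated_11_12" is just an illustrative name.
concatenated_11_12 <- textEmbedLayerAggregation(
  imf_embeddings_11_12$context_tokens,
  layers = 11:12,
  aggregation_from_layers_to_tokens = "concatenate",
  aggregation_from_tokens_to_texts = "mean"
)
ncol(concatenated_11_12[[1]]) # expected: 1,536 (2 layers x 768 dimensions)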

# # # # # # # # # # # # # # # # # # # # # # # # # # # #

# 2. Select layer 11 and aggregate tokens using the mean (results in 768 dimensions).
textEmbedLayerAggregation(
  imf_embeddings_11_12$context_tokens,
  layers = 11,
  aggregation_from_tokens_to_texts = "mean"
)

# OUTPUT

# # # # # # # # # # # # # # # # # # # # # # # # # # # #


# Examine the relationship between the satisfaction text responses and the corresponding rating scale (the Satisfaction with Life Scale)
model_satisfactiontext_swls <- textTrain(
  x = word_embeddings$texts$satisfactiontexts, # the predictor variables (i.e., the word embeddings)
  y = Language_based_assessment_data_8$swlstotal, # the criterion variable (i.e., the rating scale score)
  model_description = "author(s): Kjell, Giorgi, & Schwartz; data: N=40, population =  Online, Mechanical Turk; publication: title = Example for demo; description: swls = the satisfaction with life scale"
)

# Examine the correlation between predicted and observed Satisfaction with Life Scale scores
model_satisfactiontext_swls$results

# OUTPUT:
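
# Optional visual check of model fit: plot cross-validated predictions against
# the observed scores. The element name "predictions" and its columns are
# assumptions about the textTrain() output; verify them with names() further below.
plot(
  model_satisfactiontext_swls$predictions$predictions,
  model_satisfactiontext_swls$predictions$y,
  xlab = "Predicted SWLS score",
  ylab = "Observed SWLS score"
)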

# # # # # # # # # # # # # # # # # # # # # # # # # # # #

# Save the model
saveRDS(
  model_satisfactiontext_swls,
  "model_satisfactiontext_swls.rds"
)
# Read the model
model_satisfactiontext_swls <- readRDS(
  "model_satisfactiontext_swls.rds"
)

# Examine the names in the object returned from training
names(model_satisfactiontext_swls)

# OUTPUT:

# # # # # # # # # # # # # # # # # # # # # # # # # # # #


# Predicting several outcomes from several word embeddings
models_words_ratings <- textTrainLists(
  word_embeddings$texts[1:2],
  Language_based_assessment_data_8[5:6]
)

# See results
models_words_ratings$results

# OUTPUT


# Save model
saveRDS(models_words_ratings, "models_words_ratings.rds")
# Read model
models_words_ratings <- readRDS(
  "models_words_ratings.rds"
)


# # # # # # # # # # # # # # # # # # # # # # # # # # # #

# Read a valence-trained prediction model (download it from https://osf.io/dgczt/)
valence_Warriner_L11 <- readRDS(
  "valence_Warriner_L11.rds"
)

# Examine the model
valence_Warriner_L11

# PART OF THE OUTPUT
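
# Optional: inspect only the stored model description. The element name mirrors
# the model_description argument used in textTrain() above; adjust it if your
# model object names it differently.
valence_Warriner_L11$model_description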

# # # # # # # # # # # # # # # # # # # # # # # # # # # #

# Apply the model to the satisfaction text
satisfaction_text_valence <- textPredict(
  valence_Warriner_L11,
  word_embeddings$texts$satisfactiontexts,
  dim_names = FALSE
)

# Examine the correlation between the predicted valence and the Satisfaction with life scale score
psych::corr.test(
  satisfaction_text_valence$word_embeddings__ypred,
  Language_based_assessment_data_8$swlstotal
)


# OUTPUT
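
# Optional scatter plot of the predicted valence against the SWLS scores
# (uses the same predicted-value column as in corr.test above).
plot(
  satisfaction_text_valence$word_embeddings__ypred,
  Language_based_assessment_data_8$swlstotal,
  xlab = "Predicted valence",
  ylab = "SWLS total score"
)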

# # # # # # # # # # # # # # # # # # # # # # # # # # # #



# Compute semantic similarity scores between two text columns, using the previously created word_embeddings.
semantic_similarity_scores <- textSimilarity(
  word_embeddings$texts$harmonytexts,
  word_embeddings$texts$satisfactiontexts
)
# Look at the first scores
head(semantic_similarity_scores)

# OUTPUT
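
# Optional: summarise the distribution of the similarity scores (textSimilarity
# uses cosine similarity by default, so scores fall between -1 and 1).
summary(semantic_similarity_scores)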
# # # # # # # # # # # # # # # # # # # # # # # # # # # #


# Read the word norms text data (used later for the semantic centrality plot)
word_norms <- read.csv(
  "Word_Norms_Mental_Health_Kjell2018_text.csv"
)

# Read the word embeddings for the word norms
word_norms_embeddings <- readRDS(
  "Word_Norms_Mental_Health_Kjell2018_text_embedding_L11.rds"
)

# Examine which word norms are included.
names(word_norms_embeddings$texts)

# OUTPUT
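
# Optional: check the dimensionality of one norm embedding ("L11" in the file
# name suggests layer 11 of bert-base, i.e., 768 dimensions).
dim(word_norms_embeddings$texts$harmonynorm)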
# # # # # # # # # # # # # # # # # # # # # # # # # # # #


# Compute semantic similarity score between the harmony answers and the harmony norm
# Note that the descriptive word answers are used instead of text answers to correspond with how the word norm was created.
norm_similarity_scores_harmony <- textSimilarityNorm(
  word_embeddings$texts$harmonywords,
  word_norms_embeddings$texts$harmonynorm
)

# Correlating the semantic measure with the corresponding rating scale
psych::corr.test(
  norm_similarity_scores_harmony,
  Language_based_assessment_data_8$hilstotal
)

# OUTPUT
# # # # # # # # # # # # # # # # # # # # # # # # # # # #


# Extract word type embeddings and text embeddings for harmony words
harmony_words_embeddings <- textEmbed(
  texts = Language_based_assessment_data_8["harmonywords"],
  aggregation_from_layers_to_tokens = "concatenate",
  aggregation_from_tokens_to_texts = "mean",
  aggregation_from_tokens_to_word_types = "mean",
  keep_token_embeddings = FALSE
)
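
# Optional: a quick look at the word-type (vocabulary-level) embeddings that
# feed the plots below (exact columns depend on the package version).
head(harmony_words_embeddings$word_types)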

# Pre-processing data for plotting
projection_results <- textProjection(
  words = Language_based_assessment_data_8$harmonywords,
  word_embeddings = harmony_words_embeddings$texts,
  word_types_embeddings = harmony_words_embeddings$word_types,
  x = Language_based_assessment_data_8$hilstotal,
  y = Language_based_assessment_data_8$age
)

# Examine the word-level data (frequencies and projection values) used for plotting
projection_results$word_data

# To avoid warnings about words not being plotted, first increase max.overlaps for the entire session:
options(ggrepel.max.overlaps = 1000)

# Plot
plot_projection <- textPlot(
  projection_results,
  min_freq_words_plot = 1,
  plot_n_word_extreme = 10,
  plot_n_word_frequency = 5,
  plot_n_words_middle = 5,
  y_axes = FALSE,
  p_alpha = 0.05,
  p_adjust_method = "fdr",
  title_top = "Harmony Words Responses (Supervised Dimension Projection)",
  x_axes_label = "Low to High Harmony in Life Scale Score",
  y_axes_label = "",
  bivariate_color_codes = c("#FFFFFF", "#FFFFFF", "#FFFFFF",
                            "#E07f6a", "#EAEAEA", "#85DB8E",
                            "#FFFFFF", "#FFFFFF", "#FFFFFF"
  )
)
# View plot

plot_projection$final_plot
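
# Optionally save the plot to file; final_plot is assumed to be a standard
# ggplot object, so ggplot2::ggsave() can be used (file name and size are
# just examples).
ggplot2::ggsave(
  "plot_projection.png",
  plot = plot_projection$final_plot,
  width = 8, height = 8, dpi = 300
)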


# # # # # # # # # # # # # # # # # # # # # # # # # # # #

# Plot
plot_projection_2D <- textPlot(
  projection_results,
  min_freq_words_plot = 1,
  plot_n_word_extreme = 10,
  plot_n_word_frequency = 5,
  plot_n_words_middle = 5,
  y_axes = TRUE, # TRUE also plots the second (y) dimension (here, age)
  p_alpha = 0.05,
  p_adjust_method = "fdr",
  title_top = "Harmony Words Responses (Supervised Dimension Projection)",
  x_axes_label = "Low vs. High Harmony in Life Scale Score",
  y_axes_label = "Low vs. High Age",
  bivariate_color_codes = c("#E07f6b", "#60A1F7", "#85DB8D",
                            "#FF0000", "#EAEAEA", "#5dc688",
                            "#E07f6a", "#60A1F7", "#85DB8E"
  )
)
# View plot
plot_projection_2D$final_plot

# # # # # # # # # # # # # # # # # # # # # # # # # # # #


# Compute each word's semantic centrality score, i.e., its semantic similarity to the aggregated embedding of all words
centrality_results <- textCentrality(
  words = word_norms$satisfactionnorm,
  word_embeddings = word_norms_embeddings$texts$satisfactionnorm,
  word_types_embeddings = word_norms_embeddings$word_types
)

options(ggrepel.max.overlaps = 1000)
centrality_plot <- textCentralityPlot(
  word_data = centrality_results,
  min_freq_words_test = 2,
  plot_n_word_extreme = 10,
  plot_n_word_frequency = 5,
  plot_n_words_middle = 5,
  title_top = "Satisfaction with life word norm: Semantic Centrality Plot",
  x_axes_label = "Satisfaction with Life Semantic Centrality"
)

centrality_plot$final_plot

# OUTPUT

# # # # # # # # # # # # # # # # # # # # # # # # # # # #


# Supplementary

# Compute the PCA results to be plotted (see help(textPCA))
textPCA_results <- textPCA(
  words = Language_based_assessment_data_8$harmonywords, # harmony words match the word type embeddings computed above
  word_types_embeddings = harmony_words_embeddings$word_types
)


# Plotting the PCA results
plot_PCA <- textPCAPlot(
  word_data = textPCA_results,
  min_freq_words_test = 2,
  plot_n_word_extreme = 5,
  plot_n_word_frequency = 5,
  plot_n_words_middle = 5
)
plot_PCA$final_plot