# For text version >= 0.9.99
#
# Walk-through of the text package: embedding text, training and applying
# prediction models, computing semantic similarity, and plotting words.
# The script reads and writes .rds/.csv files in the current working
# directory; files that are only read must be present before those steps run.

# Setup ----

# Install text from CRAN
install.packages("text")
library(text)

# Set up an environment with the Python packages that text requires
textrpp_install()

# Initialize the environment and save the settings for next time
textrpp_initialize(save_profile = TRUE)

# Embedding example texts ----

# Example text
texts <- c("I am feeling relatedness with others", "That's great!")

# Embed using the default settings
embeddings <- textEmbed(texts)

# Inspect the tokens element of the output
embeddings$tokens

# Inspect the texts element of the output
embeddings$texts

# Embedding the example data ----

# Look at example data included in the text package comprising both text and
# numerical variables (note that there are only 40 participants in this
# example).
Language_based_assessment_data_8

# Transform the text/word data to word embeddings (see help(textEmbed) to see
# the default settings).
word_embeddings <- textEmbed(
  Language_based_assessment_data_8,
  model = "bert-base-uncased",
  aggregation_from_layers_to_tokens = "concatenate",
  aggregation_from_tokens_to_texts = "mean",
  keep_token_embeddings = FALSE
)

# See how the word embeddings are structured
word_embeddings

# Save the word embeddings to avoid having to embed the text again. It is good
# practice to save output from analyses that take a lot of time to compute,
# which is often the case when analyzing text data.
saveRDS(word_embeddings, "word_embeddings.rds")

# Get the saved word embeddings (again)
word_embeddings <- readRDS("word_embeddings.rds")

# Raw layers and layer aggregation ----

# Get hidden states for "I am fine"
imf_embeddings_11_12 <- textEmbedRawLayers("I am fine", layers = 11:12)
imf_embeddings_11_12

# 1. Concatenate layers (results in 1,536 dimensions).
textEmbedLayerAggregation(
  imf_embeddings_11_12$context_tokens,
  layers = 11:12,
  aggregation_from_layers_to_tokens = "concatenate",
  aggregation_from_tokens_to_texts = "mean"
)

# 2. Aggregate layers using mean (results in 768).
# NOTE(review): only layer 11 is selected in this call and no
# aggregation_from_layers_to_tokens argument is passed, so the heading may not
# match the code -- confirm whether layers = 11:12 with
# aggregation_from_layers_to_tokens = "mean" was intended.
textEmbedLayerAggregation(
  imf_embeddings_11_12$context_tokens,
  layers = 11,
  aggregation_from_tokens_to_texts = "mean"
)

# Training a model ----

# Examine the relationship between satisfactiontext and the corresponding
# rating scale
model_satisfactiontext_swls <- textTrain(
  # the predictor variables (i.e., the word embeddings)
  x = word_embeddings$texts$satisfactiontexts,
  # the criterion variable (i.e., the rating scale score)
  y = Language_based_assessment_data_8$swlstotal,
  model_description = "author(s): Kjell, Giorgi, & Schwartz; data: N=40, population = Online, Mechanical Turk; publication: title = Example for demo; description: swls = the satisfaction with life scale"
)

# Examine the correlation between predicted and observed Satisfaction with
# Life Scale scores
model_satisfactiontext_swls$results

# Save the model
saveRDS(model_satisfactiontext_swls, "model_satisfactiontext_swls.rds")

# Read the model
model_satisfactiontext_swls <- readRDS("model_satisfactiontext_swls.rds")

# Examine the names in the object returned from training
names(model_satisfactiontext_swls)

# Predicting several outcomes from several word embeddings
models_words_ratings <- textTrainLists(
  word_embeddings$texts[1:2],
  Language_based_assessment_data_8[5:6]
)

# See results
models_words_ratings$results

# Save model
saveRDS(models_words_ratings, "models_words_ratings.rds")

# Read model
models_words_ratings <- readRDS("models_words_ratings.rds")

# Applying a pre-trained model ----

# Read a valence trained prediction model (download it from
# https://osf.io/dgczt/)
valence_Warriner_L11 <- readRDS("valence_Warriner_L11.rds")

# Examine the model
valence_Warriner_L11

# Apply the model to the satisfaction text
satisfaction_text_valence <- textPredict(
  valence_Warriner_L11,
  word_embeddings$texts$satisfactiontexts,
  dim_names = FALSE
)

# Examine the correlation between the predicted valence and the Satisfaction
# with life scale score
psych::corr.test(
  satisfaction_text_valence$word_embeddings__ypred,
  Language_based_assessment_data_8$swlstotal
)

# Semantic similarity ----

# Compute semantic similarity scores between two text columns, using the
# previously created word_embeddings.
semantic_similarity_scores <- textSimilarity(
  word_embeddings$texts$harmonytexts,
  word_embeddings$texts$satisfactiontexts
)

# Look at the first scores
head(semantic_similarity_scores)

# Read word norms text (later we will use these for the semantic centrality
# plot)
word_norms <- read.csv("Word_Norms_Mental_Health_Kjell2018_text.csv")

# Read the word embeddings for the word norms
word_norms_embeddings <- readRDS(
  "Word_Norms_Mental_Health_Kjell2018_text_embedding_L11.rds"
)

# Examine which word norms there are.
names(word_norms_embeddings$texts)

# Compute semantic similarity score between the harmony answers and the
# harmony norm.
# Note that the descriptive word answers are used instead of text answers to
# correspond with how the word norm was created.
norm_similarity_scores_harmony <- textSimilarityNorm(
  word_embeddings$texts$harmonywords,
  word_norms_embeddings$texts$harmonynorm
)

# Correlating the semantic measure with the corresponding rating scale
psych::corr.test(
  norm_similarity_scores_harmony,
  Language_based_assessment_data_8$hilstotal
)

# Supervised dimension projection plots ----

# Extract word type embeddings and text embeddings for harmony words
harmony_words_embeddings <- textEmbed(
  texts = Language_based_assessment_data_8["harmonywords"],
  aggregation_from_layers_to_tokens = "concatenate",
  aggregation_from_tokens_to_texts = "mean",
  aggregation_from_tokens_to_word_types = "mean",
  keep_token_embeddings = FALSE
)

# Pre-processing data for plotting
projection_results <- textProjection(
  words = Language_based_assessment_data_8$harmonywords,
  word_embeddings = harmony_words_embeddings$texts,
  word_types_embeddings = harmony_words_embeddings$word_types,
  x = Language_based_assessment_data_8$hilstotal,
  y = Language_based_assessment_data_8$age
)
projection_results$word_data

# To avoid warnings -- and that words do not get plotted, first increase the
# max.overlaps for the entire session:
options(ggrepel.max.overlaps = 1000)

# Plot (x-axis only)
plot_projection <- textPlot(
  projection_results,
  min_freq_words_plot = 1,
  plot_n_word_extreme = 10,
  plot_n_word_frequency = 5,
  plot_n_words_middle = 5,
  y_axes = FALSE,
  p_alpha = 0.05,
  p_adjust_method = "fdr",
  title_top = "Harmony Words Responses (Supervised Dimension Projection)",
  x_axes_label = "Low to High Harmony in Life Scale Score",
  y_axes_label = "",
  bivariate_color_codes = c(
    "#FFFFFF", "#FFFFFF", "#FFFFFF",
    "#E07f6a", "#EAEAEA", "#85DB8E",
    "#FFFFFF", "#FFFFFF", "#FFFFFF"
  )
)

# View plot
plot_projection$final_plot

# Plot (both axes)
plot_projection_2D <- textPlot(
  projection_results,
  min_freq_words_plot = 1,
  plot_n_word_extreme = 10,
  plot_n_word_frequency = 5,
  plot_n_words_middle = 5,
  # Change to TRUE/FALSE
  y_axes = TRUE,
  p_alpha = 0.05,
  p_adjust_method = "fdr",
  title_top = "Harmony Words Responses (Supervised Dimension Projection)",
  x_axes_label = "Low vs. High Harmony in Life Scale Score",
  y_axes_label = "Low vs.High Age",
  bivariate_color_codes = c(
    "#E07f6b", "#60A1F7", "#85DB8D",
    "#FF0000", "#EAEAEA", "#5dc688",
    "#E07f6a", "#60A1F7", "#85DB8E"
  )
)

# View plot
plot_projection_2D$final_plot

# Semantic centrality plot ----

# Computing words' centrality (semantic similarity) score to the aggregated
# embedding of all words
centrality_results <- textCentrality(
  words = word_norms$satisfactionnorm,
  word_embeddings = word_norms_embeddings$texts$satisfactionnorm,
  word_types_embeddings = word_norms_embeddings$word_types
)

options(ggrepel.max.overlaps = 1000)

centrality_plot <- textCentralityPlot(
  word_data = centrality_results,
  min_freq_words_test = 2,
  plot_n_word_extreme = 10,
  plot_n_word_frequency = 5,
  plot_n_words_middle = 5,
  title_top = "Satisfaction with life word norm: Semantic Centrality Plot",
  x_axes_label = "Satisfaction with Life Semantic Centrality"
)
centrality_plot$final_plot

# Supplementary ----

# PCA results to be plotted; see help(textPCA)
# NOTE(review): words come from satisfactionwords while the word type
# embeddings were built from harmonywords above -- confirm this pairing is
# intended.
textPCA_results <- textPCA(
  words = Language_based_assessment_data_8$satisfactionwords,
  word_types_embeddings = harmony_words_embeddings$word_types
)

# Plotting the PCA results
plot_PCA <- textPCAPlot(
  word_data = textPCA_results,
  min_freq_words_test = 2,
  plot_n_word_extreme = 5,
  plot_n_word_frequency = 5,
  plot_n_words_middle = 5
)
plot_PCA$final_plot