Topic modelling is a statistical method used to discover the latent topics that occur in a large collection of documents. It is particularly useful in the fields of text mining and natural language processing (NLP), and has gained significant traction in both the social sciences and the digital humanities (DH) for the analysis of large textual datasets. It allows researchers to categorise, summarise and understand large bodies of text in a way that would be time-consuming or impossible to achieve through manual analysis. It has been used to discover and visualise patterns and themes in a range of documents, including poems, novels, newspapers and diaries.
An unsupervised algorithm — one for which the specific topics are not predetermined — processes the data to identify clusters of words (topics) according to their co-occurrence within documents. The resulting topics can provide a structured way of understanding the thematic underpinnings of the corpus.
Benin
Burkina Faso
Python Code
import re

import nltk
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import requests
import stanza
from gensim import corpora, models
from nltk.corpus import stopwords
from tqdm import tqdm

# Download the NLTK stop-word corpus that is read just below.
nltk.download('stopwords')

# Extra tokens to discard on top of NLTK's French list; add more here as needed.
additional_stopwords = {'El', '000', '%'}

# Union of both lists, lower-cased so it can be compared against lower-cased lemmas.
french_stopwords = {
    word.lower()
    for word in set(stopwords.words('french')) | additional_stopwords
}

# Stanza French pipeline running the tokenize, mwt, pos and lemma processors.
nlp = stanza.Pipeline(lang='fr', processors='tokenize,mwt,pos,lemma')

# Patterns compiled once at import time; preprocess_texts applies them to every text.
newline_re = re.compile(r'\n')
apostrophe_re = re.compile(r"’")
whitespace_re = re.compile(r"\s+")
oe_re = re.compile(r"œ")
def fetch_items_from_set(item_set_ids, timeout=30):
    """Download every item of the given item sets from the IWAC API.

    For each item set the ``/api/items`` endpoint is requested page by page,
    starting at page 1, until a page comes back empty.

    Args:
        item_set_ids: Iterable of item-set identifiers to fetch.
        timeout: Seconds to wait for each HTTP response. Without a timeout a
            stalled connection would block the script indefinitely.

    Returns:
        A single flat list holding the decoded JSON entries of all pages of
        all requested item sets, in fetch order.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status code.
        requests.Timeout: If a response does not arrive within ``timeout``.
        ValueError: If a page decodes to something other than a JSON array.
    """
    base_url = "https://iwac.frederickmadore.com/api/items"
    items = []
    # One session for all requests so the underlying connection is reused.
    with requests.Session() as session:
        for set_id in tqdm(item_set_ids, desc="Fetching item sets"):
            page = 1
            while True:
                response = session.get(
                    base_url,
                    params={"item_set_id": set_id, "page": page},
                    timeout=timeout,
                )
                # Fail loudly on an error status instead of parsing an error body.
                response.raise_for_status()
                data = response.json()
                if not data:
                    # An empty page marks the end of this item set.
                    break
                if not isinstance(data, list):
                    # A non-list payload (e.g. an error object) would otherwise
                    # be merged into the results and the loop would never end.
                    raise ValueError(
                        f"Unexpected payload for item set {set_id}, page {page}: "
                        f"expected a list, got {type(data).__name__}"
                    )
                items.extend(data)
                page += 1
    return items
def extract_texts(items):
    """Collect the text values stored under each item's "bibo:content" key.

    Only entries whose ``property_label`` equals ``'content'`` and whose
    ``is_public`` flag is truthy (treated as ``True`` when absent) are kept.
    An entry without an ``'@value'`` key contributes an empty string.

    Args:
        items: Iterable of item mappings, as returned by fetch_items_from_set.

    Returns:
        A list of text strings, in the order they were encountered.
    """
    collected = []
    for record in tqdm(items, desc="Extracting texts"):
        # Items that carry no "bibo:content" entry contribute nothing.
        if "bibo:content" not in record:
            continue
        collected.extend(
            block.get('@value', '')
            for block in record["bibo:content"]
            if block.get('property_label') == 'content'
            and block.get('is_public', True)
        )
    return collected
def preprocess_texts(texts):
    """Normalise, lemmatise and filter each text for topic modelling.

    Each text is cleaned with the module-level regular expressions (newlines
    and runs of whitespace become single spaces, typographic apostrophes become
    ``'``, ``œ`` becomes ``oe``), stripped and lower-cased, then run through the
    Stanza pipeline ``nlp``. Words tagged ``PUNCT``, ``SYM`` or ``X`` and words
    whose lower-cased lemma is in ``french_stopwords`` are dropped.

    Args:
        texts: Iterable of raw text strings.

    Returns:
        A list with one string per input text: the remaining lower-cased
        lemmas joined by single spaces (empty if nothing remains).
    """
    skipped_upos = {'PUNCT', 'SYM', 'X'}
    processed_texts = []
    for text in tqdm(texts, desc="Preprocessing texts"):
        text = newline_re.sub(' ', text)
        text = apostrophe_re.sub("'", text)
        text = whitespace_re.sub(" ", text)
        text = oe_re.sub("oe", text)
        text = text.strip().lower()  # Convert to lower case before processing

        # Process the cleaned text with Stanza
        doc = nlp(text)
        tokens = []
        for sent in doc.sentences:
            for word in sent.words:
                if word.upos in skipped_upos:
                    continue
                # Fall back to the surface form when no lemma was assigned;
                # calling .lower() on a missing (None) lemma would raise
                # AttributeError and abort the whole run.
                lemma = (word.lemma or word.text).lower()
                if lemma not in french_stopwords:
                    tokens.append(lemma)
        processed_texts.append(' '.join(tokens))
    return processed_texts
def perform_lda(texts, num_topics=5, passes=15, random_state=None):
    """Train a gensim LDA topic model on whitespace-tokenised texts.

    Args:
        texts: Iterable of strings whose tokens are separated by whitespace
            (the format produced by preprocess_texts).
        num_topics: Number of topics to extract.
        passes: Number of training passes over the corpus.
        random_state: Optional seed forwarded to ``LdaModel``; pass a fixed
            value to make runs reproducible. ``None`` keeps gensim's default.

    Returns:
        A ``(lda_model, corpus, dictionary)`` tuple: the trained model, the
        bag-of-words corpus it was trained on, and the gensim dictionary that
        maps token ids to tokens.
    """
    # Tokenise once and reuse the result for both the dictionary and the corpus.
    tokenized = [text.split() for text in texts]
    dictionary = corpora.Dictionary(tokenized)
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]
    lda_model = models.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        update_every=1,
        chunksize=100,
        iterations=50,
        random_state=random_state,
    )
    return lda_model, corpus, dictionary
def create_visualization(lda_model, corpus, dictionary, file_name):
    """Prepare a pyLDAvis view of the model and save it with ``save_html``.

    Args:
        lda_model: Trained model, as returned by perform_lda.
        corpus: Bag-of-words corpus the model was trained on.
        dictionary: gensim dictionary matching ``corpus``.
        file_name: Destination passed to ``pyLDAvis.save_html``.
    """
    pyLDAvis.save_html(
        gensimvis.prepare(lda_model, corpus, dictionary),
        file_name,
    )
def main():
    """Run the pipeline for Benin and Burkina Faso.

    Every stage (fetch, extract, preprocess, LDA, visualisation) is applied to
    Benin first and Burkina Faso second before the next stage starts. One HTML
    visualisation is written per country.
    """
    # country -> (item-set ids to download, output file for the visualisation)
    countries = {
        'benin': (
            [2187, 2188, 2189],
            'lda_visualization_benin.html',
        ),
        'burkina_faso': (
            [2200, 2215, 2214, 2207, 2201],
            'lda_visualization_burkina_faso.html',
        ),
    }

    # Dicts preserve insertion order, so each stage handles Benin, then Burkina Faso.
    items = {name: fetch_items_from_set(ids) for name, (ids, _) in countries.items()}
    texts = {name: extract_texts(found) for name, found in items.items()}
    processed = {name: preprocess_texts(raw) for name, raw in texts.items()}
    fitted = {name: perform_lda(clean) for name, clean in processed.items()}

    for name, (lda_model, corpus, dictionary) in fitted.items():
        create_visualization(lda_model, corpus, dictionary, countries[name][1])


if __name__ == "__main__":
    main()