NLP techniques on job postings

The below is simply playing around with some NLP ideas on a batch of job descriptions from reed.com that are available on Kaggle:

Download them here

You can follow along with a Jupyter Notebook or similar to see these results.

import pandas as pd
import spacy
from spacy.matcher import Matcher
from pprint import pprint
import math
import string
import warnings
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS  # in newer scikit-learn versions this lives in sklearn.feature_extraction.text
import re
import gensim
from gensim.corpora import Dictionary
from gensim.corpora import MmCorpus
from gensim.utils import simple_preprocess
import pyLDAvis
import pyLDAvis.gensim
import pickle
warnings.filterwarnings('ignore')

First, some preprocessing

We want to have all the job descriptions lemmatized, with common n-gram terms identified.

Load up the file containing the job postings and create a list of all the job description contents…

df = pd.read_csv("reed_uk.csv")
raw_jds = df['job_description'].tolist()
# Lemmatization, bigram, trigram, stopword removal, etc. Uncomment to rerun
'''
def jd_to_words(jds):
    for jd in jds:
        yield(gensim.utils.simple_preprocess(str(jd), deacc=True))

data_words = list(jd_to_words(raw_jds))

bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
bigram.save("bg_model")
trigram = gensim.models.Phrases(bigram[data_words], threshold=100) 
trigram.save("tg_model")
bigram = gensim.models.phrases.Phrases.load("bg_model")
trigram = gensim.models.phrases.Phrases.load("tg_model")
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
def remove_stopwords(text):
    return([word for word in simple_preprocess(str(text)) if word not in ENGLISH_STOP_WORDS])

def lemmatization(text, allowed_postags):
    doc = nlp_1(" ".join(text)) 
    text_out = [token.lemma_ for token in doc if token.pos_ in allowed_postags]
    text_out = " ".join(text_out)
    return(text_out)
nlp_1 = spacy.load('en', disable=['parser', 'ner'])
df['lemmatized'] = ''
total_rows = df.shape[0]
for index, row in df.iterrows():
    jd = row['job_description'].encode('ascii', errors='ignore').decode()
    jd_no_stops = remove_stopwords(jd)
    jd_tg = trigram_mod[bigram_mod[jd_no_stops]]
    lemmatized = lemmatization(jd_tg, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    df.at[index, "lemmatized"] = lemmatized
df.to_csv("reed_jobs_jd_lemmatized.csv")
df = pd.read_csv("reed_jobs_jd_lemmatized.csv")

LDA Modeling

Now let’s build an LDA model to identify the parts of job descriptions that do not pertain to relevant skills information. We can then safely ignore those sentences.


jds_list_lemmatized = df['lemmatized'].tolist()
split_jds = [jd.split() for jd in jds_list_lemmatized]
gensim_dict = Dictionary(split_jds)
gensim_dict.save("jd_gensim.dict")
corpus = [gensim_dict.doc2bow(text) for text in split_jds]
MmCorpus.serialize("mmcorpus.mm", corpus)
# load up the "here's some I made earlier" components

gensim_dict = Dictionary.load("jd_gensim.dict")
corpus = MmCorpus("mmcorpus.mm")
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=gensim_dict,
                                           num_topics=25, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

lda_model.save("reed_jd_lda_1")
lda_model = gensim.models.ldamodel.LdaModel.load("reed_jd_lda_1")
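Before moving on to the visualisation, it can also help to print the most probable words for each topic straight from gensim (a quick sanity check using the LdaModel print_topics method):

# Peek at the top words per topic -- useful when naming them below
for topic_id, words in lda_model.print_topics(num_topics=25, num_words=8):
    print(topic_id, words)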

We can load this up into pyLDAvis to have a look at the topics that have been identified. We are mainly interested in the generic job-description topics here (so that we can exclude them), but we’ll name as many as possible anyway.

LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, 
                                          corpus,
                                          gensim_dict)

with open("lda_vis_prep", 'wb') as f:
    pickle.dump(LDAvis_prepared, f)
with open("lda_vis_prep", 'rb') as f:
    LDAvis_prepared = pickle.load(f)
    pyLDAvis.display(LDAvis_prepared)

pyLDAvis.display(LDAvis_prepared)
topic_tags = {
    1  : "exclude",
    2  : "Customer Service",
    3  : "exclude",
    4  : "Project Managment",
    5  : "managment",
    6  : "exclude",
    7  : "exclude",
    8  : "sales",
    9  : "Finance Administration",
    10 : "exclude",
    11 : "Health and Safety",
    12 : "Manufacturing",
    13 : "Recruitment",
    14 : "Finance Regulatory",
    15 : "Digital Marketing",
    16 : "Technician",
    17 : "Charity / Fundraising",
    18 : "Graduate",
    19 : "Hospitality",
    20 : "Care",
    21 : "Catering",
    22 : "Transport",
    23 : "Education", 
    24 : "Unknown Topic 1",
    25 : "exclude"}
bigram = gensim.models.phrases.Phrases.load("bg_model")
trigram = gensim.models.phrases.Phrases.load("tg_model")
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
nlp_1 = spacy.load('en', disable=['parser', 'ner'])
def is_exclude_sent(sent):
    # despite the name, this returns the sentence's full topic distribution as a DataFrame rather than a boolean
    sent = sent.encode('ascii', errors='ignore').decode()
    sent_no_stops = remove_stopwords(sent)
    sent_tg = trigram_mod[bigram_mod[sent_no_stops]]
    lemmatized = lemmatization(sent_tg, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    bow = gensim_dict.doc2bow(lemmatized.split())
    vector = lda_model[bow][0]
    topics_df = pd.DataFrame(vector, columns=['topic', 'freq'])
    topics_df = topics_df.sort_values('freq', ascending=False)
   # topics_df = topics_df[topics_df["freq"] > min_topic_freq]
    topics_df['topic'] = topics_df['topic'].apply(lambda x : topic_tags[x])
    topics_df = topics_df.set_index('topic')
    return(topics_df)
is_exclude_sent("Key Accountabilities & Responsibilities In association with content editors, support the online delivery of marketing campaigns to drive interest and salesTake ownership of on-site journey, identifying design problems and devise elegant solutions, driving the implementation of these initiatives to continually increase conversion and overall revenue")
topic                      freq
Health and Safety          0.206442
Unknown Topic 1            0.185875
Charity / Fundraising      0.140185
Transport                  0.096800
management                 0.075793
exclude                    0.053018
Project Management         0.048530
Finance Administration     0.038737
Hospitality                0.031358
Technician                 0.024398
Care                       0.014776
exclude                    0.014093
sales                      0.010409
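To actually use this for filtering, one option (a rough sketch of my own, not something run as part of this post) is to split a description into sentences and keep only those whose top-ranked topic is not tagged "exclude":

# Hypothetical helper; the naive full-stop split is a simplification and a proper
# sentence splitter would do a better job.
def keep_skill_sentences(jd_text):
    sentences = [s.strip() for s in jd_text.split('.') if s.strip()]
    kept = []
    for s in sentences:
        topics = is_exclude_sent(s)  # DataFrame of topic -> freq, sorted by freq descending
        if len(topics) > 0 and topics.index[0] != "exclude":
            kept.append(s)
    return(kept)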

Now we’ll make some helper functions to identify term frequency within a job description, inverse document frequency across the corpus, and key terms for each industry.

Term Frequency

Calculates the ratio of the number of times a word appears to the total length of the job description:

$f_{t, d}$

def TF(description):
    description = description.split() # splits on whitespace
    desc_length = len(description)
    lemmas_dict = {}
    for token in description:
        if token not in lemmas_dict:
            lemmas_dict[token] = 1
        else:
            lemmas_dict[token] += 1
    TF_dict = {k:v/desc_length for k, v in lemmas_dict.items()}
    return(TF_dict)
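For example, on a toy (already lemmatized) description:

TF("python developer python sql")
# {'python': 0.5, 'developer': 0.25, 'sql': 0.25}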

Inverse Document Frequency

Calculates the natural log of the ratio of the total number of descriptions to the number of descriptions the word appears in.

$\log \frac{N} {n_{t}}$
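For example, a word appearing in 500 of 10,000 descriptions gets an IDF of $\log \frac{10000}{500} = \log 20 \approx 3.0$, while a word appearing in every description gets $\log 1 = 0$.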

Uncomment the below 3 cells to re-process (lengthy)

def IDF(desc_list):
    idf_dict = {}
    for ind, desc in enumerate(desc_list):
        desc = desc.split()
        lemmas = []
        for token in desc:
            lemmas.append(token)
        lemmas = list(set(lemmas))
        for l in lemmas:
            if l not in idf_dict:
                idf_dict[l] = 1
            else:
                idf_dict[l] += 1
    idf_dict = {k:math.log(len(desc_list) / v) for k, v in idf_dict.items()}
    return(idf_dict)

idf_dict = IDF(df['lemmatized'].tolist())
print(len(idf_dict.keys()))
IDF_df = pd.DataFrame.from_dict(idf_dict, orient="index", columns=['IDF'])
IDF_df = IDF_df.sort_values(by=['IDF'], ascending=False)
IDF_df.to_csv("IDF.csv")
IDF_df = pd.read_csv("IDF.csv", index_col=0)

TF-IDF

The product of the two functions above; it roughly represents how important a word is within a given JD.

$f_{t, d} \times \log \frac{N} {n_{t}}$
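For example, a word with a term frequency of 0.02 in a particular description and an IDF of 3.0 across the corpus scores $0.02 \times 3.0 = 0.06$, whereas a ubiquitous word with an IDF near zero scores close to zero no matter how often it appears.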

def TFIDF(JD):
    JD_TF = TF(JD) # note: this assumes an already preprocessed (lemmatized) job description
    TF_df = pd.DataFrame.from_dict(JD_TF, orient="index", columns=["TF"])
    for index, row in TF_df.iterrows():
        try:
            TF_df.at[index, 'TFIDF'] = row['TF'] * IDF_df.at[row.name, "IDF"]
        except KeyError as e:
            # the term has no IDF entry, so give it a score of zero
            TF_df.at[index, 'TFIDF'] = 0.0
    TF_df = TF_df.sort_values(by=['TFIDF'], ascending=False)
    return(TF_df)

Identify Key Terms by Industry

We want a list of key terms that are especially relevant to each industry. We can use the same principles as TF-IDF, but simply treat all the job descriptions from a specific industry as a single “document” for the TF component, i.e. term frequency across the industry multiplied by inverse document frequency:

$f_{t, i} \times \log \frac{N} {n_{t}}$

def TFI(list_jds):
    lemmas_dict = {}
    total_words = 0
    total_jds = len(list_jds)
    for ind, jd in enumerate(list_jds):
        desc = jd.split() # splits on whitespace
        desc_length = len(desc)
        total_words += desc_length
        for token in desc:
            if token not in lemmas_dict:
                lemmas_dict[token] = 1
            else:
                lemmas_dict[token] += 1
    TF_dict = {k:v/total_words for k, v in lemmas_dict.items()}
    return(TF_dict)

def TFI_IDF(JDs_list):
    TFI_dict = TFI(JDs_list)
    TFI_df = pd.DataFrame.from_dict(TFI_dict, orient="index", columns=["TF"])
    total_rows = TFI_df.shape[0]
    for index, row in TFI_df.iterrows():
        try:
            TFI_df.at[index, 'TFI_IDF'] = row['TF'] * IDF_df.at[row.name, "IDF"]
        except KeyError as e:
            # the term has no IDF entry, so give it a score of zero
            TFI_df.at[index, 'TFI_IDF'] = 0.0
    output_df = TFI_df.sort_values(by=['TFI_IDF'], ascending=False)
    return(output_df)  

Make the industry word lists

We’ll run this process on each industry and save the results to disk as CSVs.

industries_list = list(set(df['category'].tolist()))

for ind in industries_list:
    ind_df = df[df['category'] == ind]
    ind_JDs = ind_df['lemmatized'].tolist()
    tfi_idf_df = TFI_IDF(ind_JDs)
    tfi_idf_df.to_csv("key_terms_for_{}.csv".format(ind.replace(' ', '_')))
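As a quick check we can load one of the resulting files back up and look at its top terms. The filename follows the pattern in the loop above, using the 'hr jobs' category (which appears later in this post) as an example:

hr_terms = pd.read_csv("key_terms_for_hr_jobs.csv", index_col=0)
print(hr_terms.head(10))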

Compound Terms

We’ve found some n-gram terms with the preprocessing above. We can also use noun chunks and pattern matching to identify key terms inside the job descriptions.

nlp = spacy.load("en")

def noun_chunks(JD, threshold): 
    desc = nlp(JD)
    exclude_list = []
    for ent in desc.ents:
        if ent.label_ in ['PERSON', 'GPE', 'DATE', 'TIME', 'MONEY', 'LOC']:
            exclude_list.append(ent.text)
        
    tfidf = TFIDF(JD)
    noun_chunks = [chunk for chunk in desc.noun_chunks if len(chunk) > 1]
    nc_df = pd.DataFrame()
    nc_df['chunks'] = pd.Series(noun_chunks)
    for index, row in nc_df.iterrows():
        tfidf_score = 0
        for i in row['chunks']:
            try:
                tfidf_score += tfidf.at[i.lemma_, 'TFIDF']
            except KeyError:
                pass
        nc_df.at[index, 'tfidf_score'] = tfidf_score / len(row['chunks'])
            
    nc_df = nc_df.sort_values(by=['tfidf_score'], ascending=False)
    exc_tags = ['DT', 'PP', 'SYM', 'ADP', 'PRP', 'PRP$', 'POS' ]
    nc_df['chunks'] = nc_df['chunks'].apply(lambda x : '_'.join([w.text for w in x if (not w.is_punct and w.tag_ not in exc_tags)]))
    nc_df = nc_df[(nc_df['tfidf_score'] > threshold)]
    noun_chunks = set(list(nc_df['chunks'].tolist()))
    chunks = ' '.join(noun_chunks)
    return(chunks)
threshold = 0.01
df['noun_chunks'] = ''
num_rows = df.shape[0]
for index, row in df.iterrows():
  #  print("working on row {} of {}".format(index, num_rows))
    jd = row['job_description'].encode('ascii', errors='ignore').decode()
    df.at[index, 'noun_chunks'] = noun_chunks(jd, threshold)
df.to_csv("job_lem_noun_chunks.csv")
def pattern_match(JD, threshold):
    tfidf = TFIDF(JD)
    matcher = Matcher(nlp.vocab)
    pattern = [{'TAG' : 'NN'},
               {'TAG' : 'VBG'}]
    matcher.add("noun_verb_pair", None, pattern)
    desc = nlp(JD)
    matches = matcher(desc)
    score_dict = {}
    for match_id, start, end in matches:
        string_id = nlp.vocab.strings[match_id]
        span = desc[start:end]
        word_list = nlp(span.text)
        score = 0
        for token in word_list:
            try:
                score += tfidf.at[token.lemma_, 'TFIDF']  # accumulate, then average below
            except KeyError:
                pass
        score = score / len(word_list)
        score_dict[span.text] = score
        
    return_list = [k for k, v in score_dict.items() if v > threshold]
        
    return(return_list)
threshold = 0.01
df['pattern_matches'] = ''
num_rows = df.shape[0]
for index, row in df.iterrows():
  #  print("working on row {} of {}".format(index, num_rows))
    jd = row['job_description'].encode('ascii', errors='ignore').decode()
    df.at[index, 'pattern_matches'] = pattern_match(jd, threshold)
df.to_csv("job_lem_chunks_patterns.csv")

Filter out unwanted tags

such as peoples names, dates, times, etc

def single_word_filter(JD, threshold):
    desc = nlp(JD)
    single_word_tfidf = TFIDF(JD)
    exclude_list = []
    for ent in desc.ents:
        if ent.label_ in ['PERSON', 'GPE', 'DATE', 'TIME', 'MONEY', 'LOC']:
            exclude_list.append(ent.text)
    exclude_list = [x.split() for x in exclude_list]
    exclude_list = [item for sublist in exclude_list for item in sublist]
    tag_include = ['NN', 'NNS', 'VB', 'VBZ', 'VBP', 'VBN', 'VBG']
    token_list = [t for t in desc if t.text not in exclude_list]
    # apply the tag filter to the already name/date-filtered list rather than the raw doc
    token_list = [t for t in token_list if t.tag_ in tag_include]
    return_list = []
    for t in token_list:
        try:
            tfidf = single_word_tfidf.at[t.lemma_, 'TFIDF']
        except KeyError:
            continue
        if tfidf > threshold:
            return_list.append(t.lemma_)
    return(list(set(return_list)))
def key_tags(JD):
    threshold = 0.01
    JD = JD.encode('ascii', errors='ignore').decode()
    chunks = noun_chunks(JD, threshold)
    pattern_matches = pattern_match(JD, threshold)
    single_word_tags = single_word_filter(JD, threshold)
    tags = chunks.split() + pattern_matches + single_word_tags  # noun_chunks() returns a space-joined string
    tags = [t for t in tags if t.lower() not in global_exclude]  # global_exclude: assumed to be a set of generic words to drop (not defined in this post)
    tags = [t for t in tags if len(t) > 1]
    tags = [t for t in tags if len(t.split()) < 5]
    return(tags)

Let’s test it out on some JDs…

for index, row in df.iterrows():
    if row['category'] == 'hr jobs':
        print(row['job_title'])
        print(key_tags(row['job_description']))

Run the cell below to add the TF-IDF-based tags to all the job postings

for index, row in df.iterrows():
    print(index)
    if index % 50 == 0:
        df.to_csv("jobs_jd_tags_temp.csv")
    tags = key_tags(row['job_description'])
    tags = [x.lower().replace(' ', '_') for x in tags]
    tags_str = ' '.join(tags)
    df.at[index, 'tags'] = tags_str
df.to_csv("jobs_jd_tags.csv")

Outcome

This method has produced some interesting tags, many of which are clearly skills. However, there is also a lot of noise, and no clear way to filter it out. This might be improved by training a new spaCy classifier on a labelled dataset; DataTurks is a potential tool for this.

Second Pass: Latent Dirichlet Allocation / Topic Modeling

Using topic modeling to identify key areas of the corpus and the associated topics might yield valuable results and help with the process of categorizing job descriptions.

raw_jds = df['job_description'].tolist()
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
data_words = list(sent_to_words(raw_jds))

Rather than using the more complicated noun chunking approach to compound tags, we’ll simply use statistical bigram and trigram identification. Any tags longer than three words will not be caught by this process.

bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
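To get a feel for what the phrasers do, pass a tokenised snippet through them; pairs and triples that co-occur often enough in this corpus come back joined with underscores (the sample sentence below is just an illustration, and whether a given phrase is merged depends on its counts):

sample = "excellent customer service and project management skills".split()
print(trigram_mod[bigram_mod[sample]])
# e.g. ['excellent', 'customer_service', 'and', 'project_management', 'skills'] if those pairs cleared the threshold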

Some utility functions for processing the text

def remove_stopwords(texts):
    return([[word for word in simple_preprocess(str(doc)) if word not in ENGLISH_STOP_WORDS] for doc in texts])

def make_bigrams(texts):
    return([bigram_mod[doc] for doc in texts])

def make_trigrams(texts):
    return([trigram_mod[bigram_mod[doc]] for doc in texts])

def lemmatization(texts, allowed_postags):
    texts_out = []
    n_texts = len(texts)
    for en, sent in enumerate(texts):
        print("{} out of {} lemmatized".format(en, n_texts), end='\r')
        doc = nlp_1(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return(texts_out)

NB: The below is a somewhat lengthy process; once run, comment it out and rely on the saved data that is loaded up two cells below.

data_words_nostops = remove_stopwords(data_words)
# data_words_bigrams = make_bigrams(data_words_nostops)
data_words_trigrams = make_trigrams(data_words_nostops)
nlp_1 = spacy.load('en', disable=['parser', 'ner'])
data_lemmatized = lemmatization(data_words_trigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

Create a dictionary of terms from the job descriptions; the corpus is then a bag-of-words representation of each description (term id and count pairs).

gensim_dict = Dictionary(data_lemmatized)
gensim_dict.save("jd_gensim.dict")
corpus = [gensim_dict.doc2bow(text) for text in data_lemmatized]
MmCorpus.serialize("mmcorpus.mm", corpus)
gensim_dict = Dictionary.load("jd_gensim.dict")
corpus = MmCorpus("mmcorpus.mm")
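As a reminder of what the corpus holds, doc2bow maps each token to a (dictionary id, count) pair and silently drops any token the dictionary has never seen (the toy tokens below are just an illustration):

example_bow = gensim_dict.doc2bow(["customer", "service", "customer"])
print(example_bow)  # e.g. [(id_of_service, 1), (id_of_customer, 2)] -- actual ids depend on the dictionary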
mallet_path = "mallet-2.0.8"
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=gensim_dict,
                                           num_topics=25, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

lda_model.save("reed_jd_lda")

Using LDAVis we can have a look at the clusters which represent topics

LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, 
                                          corpus,
                                          gensim_dict)

with open("lda_vis_prep", 'wb') as f:
    pickle.dump(LDAvis_prepared, f)
with open("lda_vis_prep", 'rb') as f:
    LDAvis_prepared = pickle.load(f)
    pyLDAvis.display(LDAvis_prepared)
pyLDAvis.display(LDAvis_prepared)
def lda_desc(text, min_topic_freq=0.05):
    parsed_text = nlp(text)
    ug_parsed_text = [t.lemma_ for t in parsed_text if not t.is_punct]
    tg_parsed_text = trigram_mod[ug_parsed_text]
    tg_parsed_text = [t for t in tg_parsed_text if t not in ENGLISH_STOP_WORDS]
    text_bow = gensim_dict.doc2bow(tg_parsed_text)
    text_lda = lda_model[text_bow][0]
    topics_df = pd.DataFrame(text_lda, columns=['topic', 'freq'])
    topics_df = topics_df.sort_values('freq', ascending=False)
    topics_df = topics_df[topics_df["freq"] > min_topic_freq]
    topics_df['topic'] = topics_df['topic'].apply(lambda x : topic_names[x])  # topic_names: assumed to be a topic-id -> label dict for this second model, built like topic_tags above (not shown)
    topics_df = topics_df.set_index('topic')
    return(topics_df)
import textract
text = textract.process("test.docx").decode()
lda_desc(text)
topic                          freq
Unknown Topic 2                0.171854
Business Process               0.106373
Unknown Topic 1                0.099545
Financial                      0.089373
Networking                     0.060226
Marketing and Digital Media    0.059754
Written on January 9, 2019