%%bash

# parameters
n=1 # ngram
l=chi-sim # language (see section 3 for language codes)
yearLowerBound=1900 # start year
yearUpperBound=2008 # end year
wordCountLowerBound=1 # number of word counts to filter
bookCountLowerBound=1 # number of book counts to filter

# download
./downloadAndFilter.ngram.sh $n $l $yearLowerBound $yearUpperBound $wordCountLowerBound $bookCountLowerBound

googlebooks-chi-sim-all-1gram-20120701-a
googlebooks-chi-sim-all-1gram-20120701-a.filtered
googlebooks-chi-sim-all-1gram-20120701-b
googlebooks-chi-sim-all-1gram-20120701-b.filtered
googlebooks-chi-sim-all-1gram-20120701-c
googlebooks-chi-sim-all-1gram-20120701-c.filtered
googlebooks-chi-sim-all-1gram-20120701-d
googlebooks-chi-sim-all-1gram-20120701-d.filtered
googlebooks-chi-sim-all-1gram-20120701-e
googlebooks-chi-sim-all-1gram-20120701-e.filtered
googlebooks-chi-sim-all-1gram-20120701-f
googlebooks-chi-sim-all-1gram-20120701-f.filtered
googlebooks-chi-sim-all-1gram-20120701-g
googlebooks-chi-sim-all-1gram-20120701-g.filtered
googlebooks-chi-sim-all-1gram-20120701-h
googlebooks-chi-sim-all-1gram-20120701-h.filtered
googlebooks-chi-sim-all-1gram-20120701-i
googlebooks-chi-sim-all-1gram-20120701-i.filtered
googlebooks-chi-sim-all-1gram-20120701-j
googlebooks-chi-sim-all-1gram-20120701-j.filtered
googlebooks-chi-sim-all-1gram-20120701-k
googlebooks-chi-sim-all-1gram-20120701-k.filtered
googlebooks-chi-sim-all-1gram-20120701-l
googlebooks-chi-sim-all-1gram-20120701-l.filtered
googlebooks-chi-sim-all-1gram-20120701-m
googlebooks-chi-sim-all-1gram-20120701-m.filtered
googlebooks-chi-sim-all-1gram-20120701-n
googlebooks-chi-sim-all-1gram-20120701-n.filtered
googlebooks-chi-sim-all-1gram-20120701-o
googlebooks-chi-sim-all-1gram-20120701-o.filtered
googlebooks-chi-sim-all-1gram-20120701-p
googlebooks-chi-sim-all-1gram-20120701-p.filtered
googlebooks-chi-sim-all-1gram-20120701-q
googlebooks-chi-sim-all-1gram-20120701-q.filtered
googlebooks-chi-sim-all-1gram-20120701-r
googlebooks-chi-sim-all-1gram-20120701-r.filtered
googlebooks-chi-sim-all-1gram-20120701-s
googlebooks-chi-sim-all-1gram-20120701-s.filtered
googlebooks-chi-sim-all-1gram-20120701-t
googlebooks-chi-sim-all-1gram-20120701-t.filtered
googlebooks-chi-sim-all-1gram-20120701-u
googlebooks-chi-sim-all-1gram-20120701-u.filtered
googlebooks-chi-sim-all-1gram-20120701-v
googlebooks-chi-sim-all-1gram-20120701-v.filtered
googlebooks-chi-sim-all-1gram-20120701-w
googlebooks-chi-sim-all-1gram-20120701-w.filtered
googlebooks-chi-sim-all-1gram-20120701-x
googlebooks-chi-sim-all-1gram-20120701-x.filtered
googlebooks-chi-sim-all-1gram-20120701-y
googlebooks-chi-sim-all-1gram-20120701-y.filtered
googlebooks-chi-sim-all-1gram-20120701-z
googlebooks-chi-sim-all-1gram-20120701-z.filtered

googlebooks-chi-sim-all-1gram-20120701 download and filter is finished!


%%bash

# parameters
n=1 # ngram
l=chi-sim # language
ignore_case=True # Converts the ngrams into lowercase and consolidates the raw counts of any two identical ngram.
restriction=True # Removes ngrams with a zero raw counts on any year.
annotation=False #  Removes the part-of-speech annotation (the '\_NOUN' part of the ngram string) of an ngram and consolidates the raw counts of any two identical ngram.

# normalize
./normalize.ngram.py $n $l $ignore_case $restriction $annotation

normalization complete for googlebooks-chi-sim-all-1gram-20120701.filtered.ITrueRTrueAFalse.all.pkl


import matplotlib.font_manager as mfm
import matplotlib.pyplot as plt
import googleNgram as gn
import pandas as pd
import numpy as np
fontP = mfm.FontProperties(fname='unifont.ttf',size=12)


# LOAD DATASET: English with consolidated annotations
n = '1'
l = 'chi-sim'
D = gn.read(n,l,ignore_case=True,restriction=True,annotation=False)
year = D['rscore'].columns
print('Language: '+l)
print('Vocabulary Words: '+str(D['rscore'].shape[0]))
print('Years: '+str(D['rscore'].shape[1]))
print('DataFrame keys: '+str(D.keys()))

Language: chi-sim
Vocabulary Words: 180
Years: 109
DataFrame keys: dict_keys(['rscore', 'pscore', 'zscore', 'pos'])


# EXAMPLE: List of four example 1grams
chosen_1grams = ['八','不得','部','本']


# PRINT: Part-of-speech annotation vectors
print('1gram, part-of-speech annotations')
print('---------------------------------')
for i in chosen_1grams:
    print(i+', '+str(D['pos'][i]))

1gram, part-of-speech annotations
---------------------------------
八, ['NOUN', '', 'NUM']
不得, ['VERB', '', 'ADV']
部, ['NOUN', 'NUM', '']
本, ['NOUN', 'NUM', 'DET', '']


# PLOT: Time-series plots of four 1grams
fig, ax = plt.subplots(1,3,figsize=(25,5))

# rscore plot
for i, j in enumerate(chosen_1grams):
    ax[0].plot(year,D['rscore'].T[j],label=chosen_1grams[i],linewidth=2)
ax[0].set_xlabel('year')
ax[0].set_ylabel('rscore')
ax[0].set_title(l+' raw scores')
ax[0].legend(loc=2,prop=fontP)

# pscore plot
for i, j in enumerate(chosen_1grams):
    ax[1].plot(year,D['pscore'].T[j],label=chosen_1grams[i],linewidth=2)
ax[1].set_xlabel('year')
ax[1].set_ylabel('pscore')
ax[1].set_title(l+' probability scores')
ax[1].legend(loc=2,prop=fontP)

# zscore plot
for i, j in enumerate(chosen_1grams):
    ax[2].plot(year,D['zscore'].T[j],label=chosen_1grams[i],linewidth=2)
ax[2].set_xlabel('year')
ax[2].set_ylabel('zscore')
ax[2].set_title(l+' normalized scores')
ax[2].legend(loc=2,prop=fontP)
plt.show()


# LOAD DATASET: Simplified-Chinese with annotations
n = '1'
l = 'chi-sim'
Da = gn.read(n,l,ignore_case=True,restriction=True,annotation=True)
year = D['rscore'].columns
print('Language: '+l)
print('Vocabulary Words: '+str(D['rscore'].shape[0]))
print('Years: '+str(D['rscore'].shape[1]))
print('DataFrame keys: '+str(D.keys()))

Language: chi-sim
Vocabulary Words: 180
Years: 109
DataFrame keys: dict_keys(['rscore', 'pscore', 'zscore', 'pos'])


# EXAMPLE: List of four example 1grams
chosen_1grams_a = ['八_NOUN','八_NUM','不得_VERB','不得_ADV']


# PRINT: Part-of-speech annotation vectors
print('1gram, part-of-speech annotations')
print('---------------------------------')
for i in chosen_1grams_a:
    print(i+', '+str(Da['pos'][i]))

1gram, part-of-speech annotations
---------------------------------
八_NOUN, NOUN
八_NUM, NUM
不得_VERB, VERB
不得_ADV, ADV


# PLOT: Time-series plots of four 1grams
fig, ax = plt.subplots(1,3,figsize=(25,5))

# rscore plot
for i, j in enumerate(chosen_1grams):
    ax[0].plot(year,Da['rscore'].T[j],label=chosen_1grams_a[i],linewidth=2)
ax[0].set_xlabel('year')
ax[0].set_ylabel('rscore')
ax[0].set_title(l+' raw scores (w/ part-of-speech annotation)')
ax[0].legend(loc=2,prop=fontP)

# pscore plot
for i, j in enumerate(chosen_1grams):
    ax[1].plot(year,Da['pscore'].T[j],label=chosen_1grams_a[i],linewidth=2)
ax[1].set_xlabel('year')
ax[1].set_ylabel('pscore')
ax[1].set_title(l+' probability scores (w/ part-of-speech annotation)')
ax[1].legend(loc=2,prop=fontP)

# zscore plot
for i, j in enumerate(chosen_1grams):
    ax[2].plot(year,Da['zscore'].T[j],label=chosen_1grams_a[i],linewidth=2)
ax[2].set_xlabel('year')
ax[2].set_ylabel('zscore')
ax[2].set_title(l+' normalized scores (w/ part-of-speech annotation)')
ax[2].legend(loc=2,prop=fontP)
plt.show()


n = '1'
l = 'chi-sim'
D = gn.read(n,l,ignore_case=True,restriction=True,annotation=False)
R = D['rscore']
with open('1gram-list/stopwords/'+l,encoding='utf8') as file:
    stopword_list = set(file.read().split('\n'))
R_subset = R.loc[stopword_list.intersection(R.index)]
R_subset.head(5) # prints only 5 rows

part-of-speech	annotation
Noun	NOUN
Verb	VERB
Adjective	ADJ
Adverb	ADV
pronoun	PRON
determiner and article	DET
preposition and postposition	ADP
numeral	NUM
conjunction	CONJ
particle	PRT
other	O
none

	1900	1901	1902	1903	1904	1905	1906	1907	1908	1909	...	1999	2000	2001	2002	2003	2004	2005	2006	2007	2008
个	6707.0	73.0	52.0	568.0	162.0	10.0	164.0	41.0	125.0	42.0	...	5563848.0	5574929.0	5274389.0	5471106.0	5245910.0	5638150.0	6992614.0	5919769.0	4249559.0	3863737.0
使	3053.0	58.0	63.0	390.0	44.0	4.0	34.0	34.0	46.0	16.0	...	2188522.0	2255829.0	2059094.0	2052291.0	1917389.0	1972694.0	2318012.0	1949005.0	1314665.0	1193675.0
所	7296.0	136.0	129.0	468.0	188.0	61.0	165.0	149.0	289.0	88.0	...	3288280.0	3579527.0	3210061.0	3407651.0	3322054.0	3652206.0	4475949.0	3954025.0	2584698.0	2385177.0
一	32338.0	5059.0	5974.0	9763.0	8273.0	3066.0	3105.0	2406.0	9211.0	1825.0	...	15372138.0	16176331.0	14510888.0	15361638.0	14394143.0	15335049.0	18472385.0	16428392.0	11553811.0	10763052.0
及	2996.0	62.0	49.0	173.0	145.0	43.0	507.0	63.0	95.0	74.0	...	2016035.0	2307563.0	2030683.0	2121696.0	2030797.0	2257675.0	2783649.0	2475682.0	1715111.0	1485641.0

Download Raw Files of the Google Ngram Data¶

Alex John Quijano¶

1. Introduction.¶

2. Download and Process Raw Data.¶

2.1. Downloading and Filtering Raw data (with specified parameters).¶

2.2. Normalizing the Filtered Data (with specified parameters).¶

2.3. Downloading for $n > 1$.¶

3. Loading the Processed Data using the googleNgram Python module.¶

3.1. Loading and Visualizing the Dataset Examples.¶

3.1.1. Dataset with Consolidated Annotations.¶

3.1.2. Dataset with Annotations.¶

3.2. Subsetting the Data According to a word list from a File.¶

References¶