:
Pandas , , (. )
1. pandas:
# Toy data: one row per observed (determiner, word) pair.
df = pd.DataFrame(
    {
        "det": ["a", "the", "a", "a", "a", "the"],
        "word": ["cat", "pet", "pet", "cat", "pet", "pet"],
    }
)
# A dummy column gives the "count" aggregation something to count.
df["counts"] = 1
# Count the rows of each (det, word) group; reset_index turns the group
# keys back into ordinary columns so they can be used by pivot below.
df_counts = df.groupby(["det", "word"]).count().reset_index()
# Reshape to a word x det table; pairs that never occur become 0.
df_counts.pivot(index="word", columns="det", values="counts").fillna(0)
:
# Sample rows: (node, precedingWord, comp, counts).
df = pd.DataFrame(
    [
        ("idee", "het", "lel", 1),
        ("idee", "het", "lel", 1),
        ("idee", "de", "lal", 1),
        ("functie", "de", "lal", 1),
        ("functie", "de", "lal", 1),
        ("functie", "en", "lil", 1),
        ("functie", "de", "lel", 1),
        ("functie", "de", "lel", 1),
    ],
    columns=["node", "precedingWord", "comp", "counts"],
)
# Make sure every row carries a 1 in the column that gets counted.
df = df.assign(counts=1)
# Number of rows for each distinct (node, precedingWord, comp) triple,
# with the group keys restored as regular columns.
grouping_keys = ["node", "precedingWord", "comp"]
df_counts = df.groupby(grouping_keys).count().reset_index()
df_counts
2. Counter
from collections import Counter

df = pd.DataFrame(
    {
        "det": ["a", "the", "a", "a", "a", "a"],
        "word": ["cat", "pet", "pet", "cat", "pet", "pet"],
    }
)
# Count identical rows. itertuples(index=False, name=None) yields each row as
# a plain, hashable tuple; it replaces DataFrame.as_matrix(), which was
# removed in pandas 1.0.
acounter = Counter(df.itertuples(index=False, name=None))
# One output row per distinct (det, word) pair together with its frequency.
df_counts = pd.DataFrame(
    [(det, word, n) for (det, word), n in acounter.items()],
    columns=["det", "word", "counts"],
)
# Reshape to a word x det table; pairs that never occur become 0.
df_counts.pivot(index="word", columns="det", values="counts").fillna(0)
, pandas (52,6 92,9 , )
3. A third option is CountVectorizer from sklearn with an n-gram range, e.g. ngram_range=(1, 2). Something like this:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

df = pd.DataFrame(
    {
        "det": ["a", "the", "a", "a", "a", "a"],
        "word": ["cat", "pet", "pet", "cat", "pet", "pet"],
    }
)
# Join each row into one short "document" such as "a cat".
listofpairs = [" ".join(row) for _, row in df.iterrows()]
# ngram_range=(2, 2) asks for bigrams only. The token pattern matches runs of
# one or more word characters, so the single-letter token "a" is kept.
countvect = CountVectorizer(ngram_range=(2, 2), min_df=0.0, token_pattern='(?u)\\b\\w+\\b')
sparse_counts = countvect.fit_transform(listofpairs)
print("* input list:\n", listofpairs)
print("* array of counts:\n", sparse_counts.toarray())
print("* vocabulary [order of columns in the sparse array]:\n", countvect.vocabulary_)
# vocabulary_ maps each bigram string to its column index; order the bigrams
# by that index so they line up with the column sums computed below.
counter_keys = [
    tuple(bigram.split(" "))
    for bigram in sorted(countvect.vocabulary_, key=countvect.vocabulary_.get)
]
# Column sums = total number of occurrences of each bigram over all rows.
counter_values = np.sum(sparse_counts.toarray(), 0)
df_counts = pd.DataFrame(
    [(pair[0], pair[1], total) for pair, total in zip(counter_keys, counter_values)],
    columns=["det", "word", "counts"],
)
:
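1. concat: make the shared key column the index of both frames, df1 = df1.set_index("key") and df2 = df2.set_index("key"), then join them side by side: dfout = pd.concat([df1, df2], axis=1)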
2. merge
loc
( ) row,column . / ( ).
, in, :
# Label every row whose precedingWord is a member of `neuter`
# (presumably a collection of words defined elsewhere -- confirm).
df.loc[df.precedingWord.isin(neuter), "gender"] = "neuter"
# Same kind of assignment with an explicit boolean mask: build the mask once,
# then pass that same mask to .loc as the row selector.
indices_neutral = df["precedingWord"] == "de"
df.loc[indices_neutral, "gender"] = "neuter"
,
# Inline form: the boolean row mask is computed directly inside .loc.
df.loc[df["precedingWord"].eq("de"), "gender"] = "neuter"