How do I import data from MongoDB into pandas?

I have a large amount of data in a collection in MongoDB that I need to parse. How do I import this data into pandas?

I am new to pandas and numpy.

EDIT: The MongoDB collection contains sensor values tagged with date and time. The sensor values are floating-point numbers.

Sample data:

{ "_cls" : "SensorReport", "_id" : ObjectId("515a963b78f6a035d9fa531b"), "_types" : [ "SensorReport" ], "Readings" : [ { "a" : 0.958069536790466, "_types" : [ "Reading" ], "ReadingUpdatedDate" : ISODate("2013-04-02T08:26:35.297Z"), "b" : 6.296118156595, "_cls" : "Reading" }, { "a" : 0.95574014778624, "_types" : [ "Reading" ], "ReadingUpdatedDate" : ISODate("2013-04-02T08:27:09.963Z"), "b" : 6.29651468650064, "_cls" : "Reading" }, { "a" : 0.953648289182713, "_types" : [ "Reading" ], "ReadingUpdatedDate" : ISODate("2013-04-02T08:27:37.545Z"), "b" : 7.29679823731148, "_cls" : "Reading" }, { "a" : 0.955931884300997, "_types" : [ "Reading" ], "ReadingUpdatedDate" : ISODate("2013-04-02T08:28:21.369Z"), "b" : 6.29642922525632, "_cls" : "Reading" }, { "a" : 0.95821381, "_types" : [ "Reading" ], "ReadingUpdatedDate" : ISODate("2013-04-02T08:41:20.801Z"), "b" : 7.28956613, "_cls" : "Reading" }, { "a" : 4.95821335, "_types" : [ "Reading" ], "ReadingUpdatedDate" : ISODate("2013-04-02T08:41:36.931Z"), "b" : 6.28956574, "_cls" : "Reading" }, { "a" : 9.95821341, "_types" : [ "Reading" ], "ReadingUpdatedDate" : ISODate("2013-04-02T08:42:09.971Z"), "b" : 0.28956488, "_cls" : "Reading" }, { "a" : 1.95667927, "_types" : [ "Reading" ], "ReadingUpdatedDate" : ISODate("2013-04-02T08:43:55.463Z"), "b" : 0.29115237, "_cls" : "Reading" } ], "latestReportTime" : ISODate("2013-04-02T08:43:55.463Z"), "sensorName" : "56847890-0", "reportCount" : 8 } 
+51
python pandas mongodb pymongo
Apr 27 '13 at 7:59
10 answers

pymongo can give you a hand; the following is the code I use:

    import pandas as pd
    from pymongo import MongoClient


    def _connect_mongo(host, port, username, password, db):
        """ A util for making a connection to mongo """

        if username and password:
            mongo_uri = 'mongodb://%s:%s@%s:%s/%s' % (username, password, host, port, db)
            conn = MongoClient(mongo_uri)
        else:
            conn = MongoClient(host, port)

        return conn[db]


    def read_mongo(db, collection, query={}, host='localhost', port=27017,
                   username=None, password=None, no_id=True):
        """ Read from Mongo and Store into DataFrame """

        # Connect to MongoDB
        db = _connect_mongo(host=host, port=port, username=username, password=password, db=db)

        # Make a query to the specific DB and Collection
        cursor = db[collection].find(query)

        # Expand the cursor and construct the DataFrame
        df = pd.DataFrame(list(cursor))

        # Delete the _id
        if no_id:
            del df['_id']

        return df
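A minimal usage sketch (the database and collection names here are placeholders I made up, not something from the question; the query uses the sensorName field from the sample document):

    # Hypothetical names -- substitute your own database/collection
    df = read_mongo('sensor_db', 'sensor_reports', query={'sensorName': '56847890-0'})
    print(df.head())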
+75
Apr 27 '13 at 18:45

Monary does exactly that, and it is very fast.

Check out this cool post, which includes a quick tutorial and some timings.

+16
Dec 19 '13 at 22:33

You can load your MongoDB data into a pandas DataFrame with this code. It works for me; hopefully it will for you too.

    import pandas as pd
    from pymongo import MongoClient

    client = MongoClient()
    db = client.database_name
    collection = db.collection_name

    # Pull every document in the collection into a DataFrame
    data = pd.DataFrame(list(collection.find()))
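Given the nested Readings array in the question's sample document, it may be more useful to get one row per reading. Here is a hedged sketch of that, assuming pandas 1.0+ (where json_normalize is exposed at the top level) and the field names from the sample data; the database/collection names are placeholders:

    import pandas as pd
    from pymongo import MongoClient

    client = MongoClient()
    collection = client.database_name.collection_name  # placeholder names

    docs = list(collection.find({'_cls': 'SensorReport'}))

    # One row per element of the Readings array, keeping a few parent fields
    readings = pd.json_normalize(
        docs,
        record_path='Readings',
        meta=['sensorName', 'latestReportTime'],
    )
    readings['ReadingUpdatedDate'] = pd.to_datetime(readings['ReadingUpdatedDate'])
    print(readings[['sensorName', 'ReadingUpdatedDate', 'a', 'b']].head())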
+15
Dec 23 '14 at 9:15
    import pandas as pd
    from odo import odo

    data = odo('mongodb://localhost/db::collection', pd.DataFrame)
+7
Oct 20 '16 at 23:33

According to PEP 20 (the Zen of Python), simple is better than complex:

    import pandas as pd
    from pymongo import MongoClient

    db = MongoClient()  # connects to localhost:27017 by default
    df = pd.DataFrame.from_records(db.<database_name>.<collection_name>.find())

You can include query conditions just as you would in a regular MongoDB query, or even use find_one() to get only a single document from the database, etc.

and voila!
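For example, a small sketch of passing a filter and a projection to find(), using the sensorName field from the question's sample data (the database and collection names are placeholders):

    # Only reports for one sensor, and only the fields we care about
    cursor = db.sensor_db.sensor_reports.find(
        {'sensorName': '56847890-0'},
        {'Readings': 1, 'latestReportTime': 1, '_id': 0},
    )
    df = pd.DataFrame.from_records(cursor)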

+6
Oct 23 '16 at 11:43

For dealing efficiently with out-of-core data (data that does not fit into RAM), for example with parallel execution, you can try the Python Blaze ecosystem: Blaze / Dask / Odo.

Blaze (and Odo) has out-of-the-box functions for dealing with MongoDB.

A few helpful articles to get you started:

And an article that shows what amazing things are possible with the Blaze stack: Analyzing 1.7 Billion Reddit Comments with Blaze and Impala (essentially, querying 975 GB of Reddit comments in seconds).

P.S. I am not affiliated with any of these technologies.
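As a rough sketch of the same out-of-core idea using Dask directly (this is my own illustration, not from the linked articles; the database/collection names and chunk size are placeholders), you can read the collection in skip/limit chunks via pymongo and build a lazy Dask DataFrame:

    import dask
    import dask.dataframe as dd
    import pandas as pd
    from pymongo import MongoClient


    @dask.delayed
    def load_chunk(skip, limit):
        # Each task opens its own connection; db/collection names are placeholders
        coll = MongoClient()['sensor_db']['sensor_reports']
        return pd.DataFrame(list(coll.find({}, {'_id': 0}).skip(skip).limit(limit)))


    chunk_size = 10_000
    n_docs = MongoClient()['sensor_db']['sensor_reports'].count_documents({})

    # One delayed partition per chunk of documents
    parts = [load_chunk(skip, chunk_size) for skip in range(0, n_docs, chunk_size)]
    ddf = dd.from_delayed(parts)                    # lazy Dask DataFrame
    result = ddf['reportCount'].mean().compute()    # chunks are read in parallel here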

+3
Sep 27 '16 at 0:16

http://docs.mongodb.org/manual/reference/mongoexport

Export to CSV and use read_csv, or export to JSON and use DataFrame.from_records().
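A rough sketch of the CSV route, assuming the readings have already been flattened into their own collection (the db/collection names are placeholders and the field names are borrowed from the sample data; mongoexport needs an explicit field list for CSV):

    # Shell command (names are placeholders):
    #   mongoexport --db sensor_db --collection readings \
    #       --type csv --fields sensorName,ReadingUpdatedDate,a,b --out readings.csv

    import pandas as pd

    df = pd.read_csv('readings.csv', parse_dates=['ReadingUpdatedDate'])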

+2
Apr 27 '13 at 11:32

Using

 pandas.DataFrame(list(...)) 

will consume a lot of memory if the result of the iterator/generator is large.

It is better to build small chunks and concatenate them at the end:

    import pandas as pd


    def iterator2dataframes(iterator, chunk_size: int):
        """Turn an iterator into multiple small pandas.DataFrames

        This is a balance between memory and efficiency.
        """
        records = []
        frames = []
        for i, record in enumerate(iterator):
            records.append(record)
            if i % chunk_size == chunk_size - 1:
                frames.append(pd.DataFrame(records))
                records = []
        if records:
            frames.append(pd.DataFrame(records))
        return pd.concat(frames)
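A usage sketch with a pymongo cursor (database and collection names are placeholders):

    from pymongo import MongoClient

    collection = MongoClient()['sensor_db']['sensor_reports']
    df = iterator2dataframes(collection.find(), chunk_size=1000)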
+1
Sep 12 '16 at 8:19

Following this great answer from waitingkuo, I would like to add the possibility of doing this with a chunksize, in line with .read_sql() and .read_csv(). I extend the answer from Deu Leung by avoiding going through each "record" of the "iterator"/"cursor" one by one. I will borrow the previous read_mongo function.

    import pandas as pd
    from pymongo import MongoClient


    def read_mongo(db, collection, query={}, host='localhost', port=27017,
                   username=None, password=None, chunksize=100, no_id=True):
        """ Read from Mongo and Store into DataFrame, chunk by chunk """

        # Connect to MongoDB
        client = MongoClient(host=host, port=port)

        # Make a query to the specific DB and Collection
        db_aux = client[db]

        # Boundaries used to slice the cursor into chunks
        n_docs = db_aux[collection].find(query).count()
        skips_variable = list(range(0, n_docs, int(chunksize)))
        # Make sure the last (or only) chunk reaches the end of the result set
        skips_variable.append(n_docs)

        # Iteration to create the dataframe in chunks
        for i in range(1, len(skips_variable)):
            # Expand the cursor slice and construct the partial DataFrame
            df_aux = pd.DataFrame(
                list(db_aux[collection].find(query)[skips_variable[i - 1]:skips_variable[i]])
            )

            if no_id:
                del df_aux['_id']

            # Concatenate the chunks into a single df
            if 'df' not in locals():
                df = df_aux
            else:
                df = pd.concat([df, df_aux], ignore_index=True)

        return df
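The call looks the same as in the first answer; only the internals change. A brief sketch (names are placeholders again):

    df = read_mongo('sensor_db', 'sensor_reports',
                    query={'sensorName': '56847890-0'}, chunksize=5000)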
0
Mar 06 '18 at 10:43

A similar approach to those of Rafael Valero, waitingkuo and Deu Leung, using pagination:

    def read_mongo(db, collection, query=None,
                   host='localhost', port=27017, username=None, password=None,
                   chunksize=100, page_num=1, no_id=True):
        """ Read one page of `chunksize` documents from Mongo into a DataFrame """

        # Connect to MongoDB
        db = _connect_mongo(host=host, port=port, username=username,
                            password=password, db=db)

        # Calculate number of documents to skip
        skips = chunksize * (page_num - 1)

        # Avoid a mutable default argument for query
        # (explained, in Spanish, here:
        # https://www.toptal.com/python/c%C3%B3digo-buggy-python-los-10-errores-m%C3%A1s-comunes-que-cometen-los-desarrolladores-python/es)
        if not query:
            query = {}

        # Make a query to the specific DB and Collection
        cursor = db[collection].find(query).skip(skips).limit(chunksize)

        # Expand the cursor and construct the DataFrame
        df = pd.DataFrame(list(cursor))

        # Delete the _id (if present)
        if no_id and '_id' in df:
            del df['_id']

        return df
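A sketch of paging through a whole collection with this function, relying on the _connect_mongo helper and imports from the first answer (the names are placeholders and the collection is assumed to be non-empty):

    import pandas as pd

    pages = []
    page_num = 1
    while True:
        page = read_mongo('sensor_db', 'sensor_reports',
                          chunksize=1000, page_num=page_num)
        if page.empty:
            break
        pages.append(page)
        page_num += 1

    df = pd.concat(pages, ignore_index=True)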
0
Mar 20 '18 at 1:19


