Defining unique strings of a two-dimensional array (vector <vector <T>>)
I am using a data type std::vector<std::vector<T> >to store a 2D matrix / array. I would like to define unique rows of this matrix. I am looking for any advice or recommendations on how to do this.
I tried two methods.
Method 1: slightly folded. I save the index for each row with 0/1, indicating whether the row is a duplicate value and works through the matrix, storing the index of each unique row in deque. I want to store the results in <vector<vector<T> >, and therefore, from this index index, I first select and then assign rows from the matrix to the return value.
Method 2: It’s easier to read and in many cases faster than method 1. I keep a deck of unique lines that were found, and just look through the lines and compare each line with all the elements in it deque.
I compare both of these methods with matlab, and these C ++ routines are several orders of magnitude slower. Does anyone have any clever ideas on how I can speed up this operation? I am looking for this operation for matrices that can have millions of rows.
I store the unique lines in deque during the loop to avoid the cost of resizing the vector, and then copied dequein vector<vector<T> >for the results. I checked this operation closely, and it does not slow down anywhere, it makes less than 0.5% of the execution time on a matrix with 100,000 rows, for example.
Thank,
Bean
. - , , , - .
1:
template <typename T>
void uniqueRows( const std::vector<std::vector<T> > &A,
std::vector<std::vector<T> > &ret) {
// Go through a vector<vector<T> > and find the unique rows
// have a value ind for each row that is 1/0 indicating if a value
// has been previously searched.
// cur : current item being compared to every item
// num : number of values searched for. Once all the values in the
// matrix have been searched, terminate.
size_t N = A.size();
size_t num=1,cur=0,it=1;
std::vector<unsigned char> ind(N,0);
std::deque<size_t> ulist; // create a deque to store the unique inds
ind[cur] = 1;
ulist.push_back(0); // ret.push_back(A[0]);
while(num < N ) {
if(it >= N ) {
++cur; // find next non-duplicate value, push back
while(ind[cur])
++cur;
ulist.push_back(cur); //ret.push_back(A[cur]);
++num;
it = cur+1; // start search for duplicates at the next row
if(it >= N && num == N)
break;
}
if(!ind[it] && A[cur]==A[it]) {
ind[it] = 1; // mark as duplicate
++num;
}
++it;
} // ~while num
// loop over the deque and .push_back the unique vectors
std::deque<size_t>::iterator iter;
const std::deque<size_t>::iterator end = ulist.end();
ret.reserve(ulist.size());
for(iter= ulist.begin(); iter != end; ++iter) {
ret.push_back(A[*iter]);
}
}
2:
template <typename T>
inline bool isInList(const std::deque< std::vector<T> > &A,
const std::vector<T> &b) {
typename std::deque<std::vector<T> >::const_iterator it;
const typename std::deque<std::vector<T> >::const_iterator end = A.end();
for(it = A.begin(); it != end; ++it) {
if(*it == b)
return true;
}
return false;
}
template <typename T>
void uniqueRows1(const::std::vector<std::vector<T> > &A,
std::vector<std::vector<T> > &ret) {
typename std::deque<std::vector<T> > ulist;
typename std::vector<std::vector<T> >::const_iterator it = A.begin();
const typename std::vector<std::vector<T> >::const_iterator end = A.end();
ulist.push_back(*it);
for(++it; it != end; ++it) {
if(!isInList(ulist,*it)) {
ulist.push_back(*it);
}
}
ret.reserve(ulist.size());
for(size_t i = 0; i != ulist.size(); ++i) {
ret.push_back(ulist[i]);
}
}
, ( O(m*n), , O(2*m*n), ), sort/unique - ( , , Billy, .)
, Boost.Unordered, :
#include <vector>
#include <boost/foreach.hpp>
#include <boost/ref.hpp>
#include <boost/typeof/typeof.hpp>
#include <boost/unordered_set.hpp>
namespace boost {
template< typename T >
size_t hash_value(const boost::reference_wrapper< T >& v) {
return boost::hash_value(v.get());
}
template< typename T >
bool operator==(const boost::reference_wrapper< T >& lhs, const boost::reference_wrapper< T >& rhs) {
return lhs.get() == rhs.get();
}
}
// destructive, but fast if the original copy is no longer required
template <typename T>
void uniqueRows_inplace(std::vector<std::vector<T> >& A)
{
boost::unordered_set< boost::reference_wrapper< std::vector< T > const > > unique(A.size());
for (BOOST_AUTO(it, A.begin()); it != A.end(); ) {
if (unique.insert(boost::cref(*it)).second) {
++it;
} else {
A.erase(it);
}
}
}
// returning a copy (extra copying cost)
template <typename T>
void uniqueRows_copy(const std::vector<std::vector<T> > &A,
std::vector< std::vector< T > > &ret)
{
ret.reserve(A.size());
boost::unordered_set< boost::reference_wrapper< std::vector< T > const > > unique;
BOOST_FOREACH(const std::vector< T >& row, A) {
if (unique.insert(boost::cref(row)).second) {
ret.push_back(row);
}
}
}
EDIT: std::vector operator< operator==, :
template <typename t>
std::vector<std::vector<t> > GetUniqueRows(std::vector<std::vector<t> > input)
{
std::sort(input.begin(), input.end());
input.erase(std::unique(input.begin(), input.end()), input.end());
return input;
}
std::unique , std::equal .
std::unique , . , std::lexicographical_compare . , - . M * n log n ( M - , n - ), std::unique m*n.
, : m * n ^ 2 time.
EDIT: :
template <typename t>
struct VectorEqual : std::binary_function<const std::vector<t>&, const std::vector<t>&, bool>
{
bool operator()(const std::vector<t>& lhs, const std::vector<t>& rhs)
{
if (lhs.size() != rhs.size()) return false;
return std::equal(lhs.first(), lhs.second(), rhs.first());
}
};
template <typename t>
struct VectorLess : std::binary_function<const std::vector<t>&, const std::vector<t>&, bool>
{
bool operator()(const std::vector<t>& lhs, const std::vector<t>& rhs)
{
return std::lexicographical_compare(lhs.first(), lhs.second(), rhs.first(), rhs.second());
}
};
template <typename t>
std::vector<std::vector<t> > GetUniqueRows(std::vector<std::vector<t> > input)
{
std::sort(input.begin(), input.end(), VectorLess<t>());
input.erase(std::unique(input.begin(), input.end(), VectorEqual<t>()), input.end());
return input;
}