A summary with code samples of the set operations that come with STL: union, intersection, symmetrical difference and set difference.
For consistency, the two sets of integer vectors used in each example are the same and are:
vals1 = { 1, 2, 4 }
vals2 = { 1, 2, 5, 7 };
Unions
The union of two sets is formed by the elements that are present in either one of the sets, or in both.
#include <iostream>
#include <algorithm>
#include <iterator>
#include <string>
#include <vector>
typedef std::vector<int>::iterator iter;
template<typename T, typename InputIterator>
void Print(std::ostream& ostr,
InputIterator itbegin,
InputIterator itend,
const std::string& delimiter)
{
std::copy(itbegin,
itend,
std::ostream_iterator<T>(ostr, delimiter.c_str()));
}
int main()
{
// Initialise sample data sets and vectors
int vals1[] = { 1, 2, 4 };
int vals2[] = { 1, 2, 5, 7 };
int size1 = sizeof( vals1 ) / sizeof ( vals1[ 0 ] );
int size2 = sizeof( vals2 ) / sizeof ( vals2[ 0 ] );
std::vector<int> v1( vals1, vals1 + size1 );
std::vector<int> v2( vals2, vals2 + size2 );
std::vector<int> v( size1 + size2 );
std::vector<int>::iterator it_union;
std::vector<int>::iterator it;
// Sort the vectors (essential)
std::sort( v1.begin(), v1.end() );
std::sort( v2.begin(), v2.end() );
// Find the union of the two vector sets: elements present in either
// of the sets or both
it_union = set_union ( v1.begin(), v1.end(), v2.begin(), v2.end(), v.begin());
int union_size = int( it_union - v.begin() );
std::cout << "Union size = " << union_size << std::endl;
// Print the union
Print<int, iter>( std::cout, v.begin(), it_union, " " );
return 0;
}
Output:
Union size = 5
1 2 4 5 7
Intersections
Intersections consist of elements present in both sets at the same time.
#include <iostream>
#include <algorithm>
#include <iterator>
#include <string>
#include <vector>
typedef std::vector<int>::iterator iter;
template<typename T, typename InputIterator>
void Print(std::ostream& ostr,
InputIterator itbegin,
InputIterator itend,
const std::string& delimiter)
{
std::copy(itbegin,
itend,
std::ostream_iterator<T>(ostr, delimiter.c_str()));
}
int main()
{
// Initialise sample data sets and vectors
int vals1[] = { 1, 2, 4 };
int vals2[] = { 1, 2, 5, 7 };
int size1 = sizeof( vals1 ) / sizeof ( vals1[ 0 ] );
int size2 = sizeof( vals2 ) / sizeof ( vals2[ 0 ] );
std::vector<int> v1( vals1, vals1 + size1 );
std::vector<int> v2( vals2, vals2 + size2 );
std::vector<int> v( size1 + size2 );
std::vector<int>::iterator it_intersect;
std::vector<int>::iterator it;
// Sort the vectors (essential)
std::sort( v1.begin(), v1.end() );
std::sort( v2.begin(), v2.end() );
// Find the intersection of the two vector sets: elements present in both sets
it_intersect = set_intersection( v1.begin(), v1.end(), v2.begin(), v2.end(), v.begin());
int intersect_size = int( it_intersect - v.begin() );
std::cout << "Intersection size = " << intersect_size << std::endl;
// Print the intersection
Print<int, iter>( std::cout, v.begin(), it_intersect, " " );
return 0;
}
Output:
Intersection size = 2
1 2
Symmetric Differences
The set of elements that are present in one of the sets, but not in the other.
#include <iostream>
#include <algorithm>
#include <iterator>
#include <string>
#include <vector>
typedef std::vector<int>::iterator iter;
template<typename T, typename InputIterator>
void Print(std::ostream& ostr,
InputIterator itbegin,
InputIterator itend,
const std::string& delimiter)
{
std::copy(itbegin,
itend,
std::ostream_iterator<T>(ostr, delimiter.c_str()));
}
int main()
{
// Initialise sample data sets and vectors
int vals1[] = { 1, 2, 4 };
int vals2[] = { 1, 2, 5, 7 };
int size1 = sizeof( vals1 ) / sizeof ( vals1[ 0 ] );
int size2 = sizeof( vals2 ) / sizeof ( vals2[ 0 ] );
std::vector<int> v1( vals1, vals1 + size1 );
std::vector<int> v2( vals2, vals2 + size2 );
std::vector<int> v( size1 + size2 );
std::vector<int>::iterator it_symm_diff;
std::vector<int>::iterator it;
// Sort the vectors (essential)
std::sort( v1.begin(), v1.end() );
std::sort( v2.begin(), v2.end() );
// The symmetric difference of two sets is formed by the elements that are present in one
// of the sets, but not in the other.
it_symm_diff = set_symmetric_difference( v1.begin(), v1.end(), v2.begin(), v2.end(), v.begin());
int symm_diff_size = int( it_symm_diff - v.begin() );
std::cout << "Symmetric difference size = " << symm_diff_size << std::endl;
// Print the symmetric difference
Print<int, iter>( std::cout, v.begin(), it_symm_diff, " " );
return 0;
}
Output:
Symmetric difference size = 3
4 5 7
Set Differences
The elements that present in the first set, but not in the second one.
#include <iostream>
#include <algorithm>
#include <iterator>
#include <string>
#include <vector>
typedef std::vector<int>::iterator iter;
template<typename T, typename InputIterator>
void Print(std::ostream& ostr,
InputIterator itbegin,
InputIterator itend,
const std::string& delimiter)
{
std::copy(itbegin,
itend,
std::ostream_iterator<T>(ostr, delimiter.c_str()));
}
int main()
{
// Initialise sample data sets and vectors
int vals1[] = { 1, 2, 4 };
int vals2[] = { 1, 2, 5, 7 };
int size1 = sizeof( vals1 ) / sizeof ( vals1[ 0 ] );
int size2 = sizeof( vals2 ) / sizeof ( vals2[ 0 ] );
std::vector<int> v1( vals1, vals1 + size1 );
std::vector<int> v2( vals2, vals2 + size2 );
std::vector<int> v( size1 + size2 );
std::vector<int>::iterator it_set_diff;
std::vector<int>::iterator it;
// Sort the vectors (essential)
std::sort( v1.begin(), v1.end() );
std::sort( v2.begin(), v2.end() );
// The set difference of two sets is formed by the elements that are present in the first
// sets, but not in the second.
it_set_diff = set_difference( v1.begin(), v1.end(), v2.begin(), v2.end(), v.begin());
int set_diff_size = int( it_set_diff - v.begin() );
std::cout << "Set difference size = " << set_diff_size << std::endl;
// Print the set difference
Print<int, iter>( std::cout, v.begin(), it_set_diff, " " );
return 0;
}
Output:
Set difference size = 1
4
Practical example: using union / set intersection to calculate the Jaccard Index
The Jaccard index is a means of measuring the similarity and/or diversity of sample sets:
#include <iostream>
#include <vector>
#include <algorithm>
#include <iterator>
#include <string>
class Token
{
private:
std::string str;
public:
Token() {}
Token( std::string val ) : str( val ) {}
std::string Word() const
{ return str; }
};
struct Ascending
{
bool operator() ( Token& start, Token& end )
{
return start.Word() < end.Word();
}
};
int main()
{
std::string str1[] = { "The", "crazy", "cat", "sat", "on", "the", "mat" };
std::string str2[] = { "The", "cat", "sat", "on", "the", "red", "mat" };
std::vector<Token>::iterator tok_intersect, tok_union;
int size1 = sizeof( str1 ) / sizeof( str1[ 0 ] );
int size2 = sizeof( str2 ) / sizeof( str2[ 0 ] );
std::vector<Token> tokens1( str1, str1 + size1 );
std::vector<Token> tokens2( str2, str2 + size2 );
std::vector<Token> tokens( size1 + size2 );
std::sort( tokens1.begin(), tokens1.end(), Ascending() );
std::sort( tokens2.begin(), tokens2.end(), Ascending() );
tok_intersect = std::set_intersection( tokens1.begin(),
tokens1.end(),
tokens2.begin(),
tokens2.end(),
tokens.begin(),
Ascending() );
int intersect_size = int( tok_intersect - tokens.begin() );
tok_union = std::set_union ( tokens1.begin(),
tokens1.end(),
tokens2.begin(),
tokens2.end(),
tokens.begin(),
Ascending() );
int union_size = int( tok_union - tokens.begin() );
double JaccardIndex = (double) intersect_size / (double) union_size;
return 0;
}
In this example, the size of the union set is 8, since there are 8 words in either one of the sets, or in both: “The”, “crazy”, “cat”, “sat”, “on”, “the”, “red”, “mat”. The intersection size is 6 since there are 6 words present in both sets at the same time: “The”, “cat”, “sat”, “on”, “the”, “mat”. Thus the Jaccard Index is calculated as 6 / 8 = 0.75.