STL Set Operations

A summary with code samples of the set operations that come with STL: union, intersection, symmetrical difference and set difference.

For consistency, the two sets of integer vectors used in each example are the same and are:

vals1 = { 1, 2, 4 }
vals2 = { 1, 2, 5, 7 };

Unions

The union of two sets is formed by the elements that are present in either one of the sets, or in both.

#include <iostream>
#include <algorithm>
#include <iterator>
#include <string>
#include <vector>

typedef std::vector<int>::iterator iter; 

template<typename T, typename InputIterator>
void Print(std::ostream& ostr,   
           InputIterator itbegin,   
           InputIterator itend,   
           const std::string& delimiter)  
{  
    std::copy(itbegin,   
              itend,   
              std::ostream_iterator<T>(ostr, delimiter.c_str()));  
}  

int main()
{
    // Initialise sample data sets and vectors
    int vals1[] = { 1, 2, 4 };
    int vals2[] = { 1, 2, 5, 7 };
    int size1 = sizeof( vals1 ) / sizeof ( vals1[ 0 ] );
    int size2 = sizeof( vals2 ) / sizeof ( vals2[ 0 ] );
    
    std::vector<int> v1( vals1, vals1 + size1 );
    std::vector<int> v2( vals2, vals2 + size2 );
    std::vector<int> v( size1 + size2 );
    std::vector<int>::iterator it_union;
    std::vector<int>::iterator it;

    // Sort the vectors (essential)
    std::sort( v1.begin(), v1.end() );
    std::sort( v2.begin(), v2.end() );

    // Find the union of the two vector sets: elements present in either
    // of the sets or both
    it_union = set_union ( v1.begin(), v1.end(), v2.begin(), v2.end(), v.begin());
    int union_size = int( it_union - v.begin() );
    std::cout << "Union size = " << union_size << std::endl;

    // Print the union
    Print<int, iter>( std::cout, v.begin(), it_union, " " );

    return 0;
}

Output:

Union size = 5
1 2 4 5 7

Intersections

Intersections consist of elements present in both sets at the same time.

#include <iostream>
#include <algorithm>
#include <iterator>
#include <string>
#include <vector>

typedef std::vector<int>::iterator iter; 

template<typename T, typename InputIterator>
void Print(std::ostream& ostr,   
           InputIterator itbegin,   
           InputIterator itend,   
           const std::string& delimiter)  
{  
    std::copy(itbegin,   
              itend,   
              std::ostream_iterator<T>(ostr, delimiter.c_str()));  
}  

int main()
{
    // Initialise sample data sets and vectors
    int vals1[] = { 1, 2, 4 };
    int vals2[] = { 1, 2, 5, 7 };
    int size1 = sizeof( vals1 ) / sizeof ( vals1[ 0 ] );
    int size2 = sizeof( vals2 ) / sizeof ( vals2[ 0 ] );
    
    std::vector<int> v1( vals1, vals1 + size1 );
    std::vector<int> v2( vals2, vals2 + size2 );
    std::vector<int> v( size1 + size2 );
    std::vector<int>::iterator it_intersect;
    std::vector<int>::iterator it;

    // Sort the vectors (essential)
    std::sort( v1.begin(), v1.end() );
    std::sort( v2.begin(), v2.end() );

    // Find the intersection of the two vector sets: elements present in both sets
    it_intersect = set_intersection( v1.begin(), v1.end(), v2.begin(), v2.end(), v.begin());
    int intersect_size = int( it_intersect - v.begin() );
    std::cout << "Intersection size = " << intersect_size << std::endl;

    // Print the intersection
    Print<int, iter>( std::cout, v.begin(), it_intersect, " " );

    return 0;
}

Output:

Intersection size = 2
1 2

Symmetric Differences

The set of elements that are present in one of the sets, but not in the other.

#include <iostream>
#include <algorithm>
#include <iterator>
#include <string>
#include <vector>

typedef std::vector<int>::iterator iter; 

template<typename T, typename InputIterator>
void Print(std::ostream& ostr,   
           InputIterator itbegin,   
           InputIterator itend,   
           const std::string& delimiter)  
{  
    std::copy(itbegin,   
              itend,   
              std::ostream_iterator<T>(ostr, delimiter.c_str()));  
}  

int main()
{
    // Initialise sample data sets and vectors
    int vals1[] = { 1, 2, 4 };
    int vals2[] = { 1, 2, 5, 7 };
    int size1 = sizeof( vals1 ) / sizeof ( vals1[ 0 ] );
    int size2 = sizeof( vals2 ) / sizeof ( vals2[ 0 ] );
    
    std::vector<int> v1( vals1, vals1 + size1 );
    std::vector<int> v2( vals2, vals2 + size2 );
    std::vector<int> v( size1 + size2 );
    std::vector<int>::iterator it_symm_diff;
    std::vector<int>::iterator it;

    // Sort the vectors (essential)
    std::sort( v1.begin(), v1.end() );
    std::sort( v2.begin(), v2.end() );

    // The symmetric difference of two sets is formed by the elements that are present in one 
    // of the sets, but not in the other.
    it_symm_diff = set_symmetric_difference( v1.begin(), v1.end(), v2.begin(), v2.end(), v.begin());
    int symm_diff_size = int( it_symm_diff - v.begin() );
    std::cout << "Symmetric difference size = " << symm_diff_size << std::endl;

    // Print the symmetric difference
    Print<int, iter>( std::cout, v.begin(), it_symm_diff, " " );

    return 0;
}

Output:

Symmetric difference size = 3
4 5 7

Set Differences

The elements that present in the first set, but not in the second one.

#include <iostream>
#include <algorithm>
#include <iterator>
#include <string>
#include <vector>

typedef std::vector<int>::iterator iter; 

template<typename T, typename InputIterator>  
void Print(std::ostream& ostr,   
           InputIterator itbegin,   
           InputIterator itend,   
           const std::string& delimiter)  
{  
    std::copy(itbegin,   
              itend,   
              std::ostream_iterator<T>(ostr, delimiter.c_str()));  
}  

int main()
{
    // Initialise sample data sets and vectors
    int vals1[] = { 1, 2, 4 };
    int vals2[] = { 1, 2, 5, 7 };
    int size1 = sizeof( vals1 ) / sizeof ( vals1[ 0 ] );
    int size2 = sizeof( vals2 ) / sizeof ( vals2[ 0 ] );
    
    std::vector<int> v1( vals1, vals1 + size1 );
    std::vector<int> v2( vals2, vals2 + size2 );
    std::vector<int> v( size1 + size2 );
    std::vector<int>::iterator it_set_diff;
    std::vector<int>::iterator it;

    // Sort the vectors (essential)
    std::sort( v1.begin(), v1.end() );
    std::sort( v2.begin(), v2.end() );

    // The set difference of two sets is formed by the elements that are present in the first 
    // sets, but not in the second.
    it_set_diff = set_difference( v1.begin(), v1.end(), v2.begin(), v2.end(), v.begin());
    int set_diff_size = int( it_set_diff - v.begin() );
    std::cout << "Set difference size = " << set_diff_size << std::endl;

    // Print the set difference
    Print<int, iter>( std::cout, v.begin(), it_set_diff, " " );

    return 0;
}

Output:

Set difference size = 1
4

Practical example: using union / set intersection to calculate the Jaccard Index

The Jaccard index is a means of measuring the similarity and/or diversity of sample sets:

#include <iostream>
#include <vector>
#include <algorithm>
#include <iterator>
#include <string>


class Token
{
private:
	std::string str;

public:
	Token() {}
	Token( std::string val ) : str( val ) {}
	std::string Word() const
	{ return str; }
};

struct Ascending  
{  
      bool operator() ( Token& start, Token& end )  
      {  
		  return start.Word() < end.Word();  
      }  
};  


int main()
{
	std::string str1[] = { "The", "crazy", "cat", "sat", "on",  "the", "mat" };
	std::string str2[] = { "The", "cat",   "sat", "on",  "the", "red", "mat" };
	
	std::vector<Token>::iterator tok_intersect, tok_union;

	int size1 = sizeof( str1 ) / sizeof( str1[ 0 ] );
	int size2 = sizeof( str2 ) / sizeof( str2[ 0 ] );
	std::vector<Token> tokens1( str1, str1 + size1 );
	std::vector<Token> tokens2( str2, str2 + size2 );
	
	std::vector<Token> tokens( size1 + size2 ); 

	std::sort( tokens1.begin(), tokens1.end(), Ascending() );  
	std::sort( tokens2.begin(), tokens2.end(), Ascending() );  

	tok_intersect = std::set_intersection( tokens1.begin(), 
										   tokens1.end(), 
										   tokens2.begin(), 
										   tokens2.end(), 
										   tokens.begin(),
										   Ascending() );

	int intersect_size = int( tok_intersect - tokens.begin() );  

	tok_union = std::set_union ( tokens1.begin(), 
		                         tokens1.end(), 
								 tokens2.begin(), 
								 tokens2.end(), 
								 tokens.begin(),
								 Ascending() );  

    int union_size = int( tok_union - tokens.begin() );  

	double JaccardIndex = (double) intersect_size / (double) union_size;

	return 0;
}

In this example, the size of the union set is 8, since there are 8 words in either one of the sets, or in both: “The”, “crazy”, “cat”, “sat”, “on”, “the”, “red”, “mat”. The intersection size is 6 since there are 6 words present in both sets at the same time: “The”, “cat”, “sat”, “on”, “the”, “mat”. Thus the Jaccard Index is calculated as 6 / 8 = 0.75.

Leave a Reply