<?php



// Statistically Improbable Phrases

/*
# Example

error_reporting( E_ALL | E_STRICT );
ini_set( 'display_errors', 1 );

$article = <<<EOL
txt here
EOL;
$sip = new WPI_SIP();
$sip->load_stops('../txt/stopwords.txt');
$kws = $sip->phrases($article);
echo '<h2>$sip->phrases</h2><pre>' . print_r($kws, true);
echo '<h2>$sip->times</h2><pre>' . print_r($sip->times, true);
echo '<h2>$sip->words</h2><pre>' . print_r($sip->words, true);
echo '<h2>$sip->phrases</h2><pre>' . print_r($sip->phrases, true);

// best list
$best = array_merge( $sip->phrases, array_slice($sip->words, 0, 5) );
echo '<h2>$best</h2><pre>' . print_r($best, true);
*/



/**
 * Keyword / key-phrase extractor ("statistically improbable phrases").
 *
 * Given a block of text it builds a word-frequency table and a table of
 * repeated two-word phrases, then ranks the phrases by the frequency of the
 * words they contain.
 *
 * Only ASCII letters and digits are treated as word characters; every other
 * character is turned into a space before tokenising.
 */
class WPI_SIP {

	/**
	 * Stop words to ignore. Filled by load_stops() or assigned directly.
	 * Entries are trimmed and lower-cased when used, because the text they
	 * are compared against is always lower-cased.
	 */
	public $stop_list = array();

	/** Not referenced anywhere in this class; kept because it is public. */
	public $stops;

	/** word => frequency map from the last phrases() call, most frequent first. */
	public $words;

	/** "word word" => frequency map of repeated two-word phrases from the last phrases() call. */
	public $phrases;

	/** microtime(true) stamps and duration (seconds) of the last phrases() call. */
	public $times = array( 'start'=>0, 'finish'=>0, 'taken'=>0 );


	public function __construct() {
	}


	/**
	 * Loads stop words from a text file, one entry per line.
	 *
	 * Blank lines are skipped and surrounding whitespace is removed from each
	 * entry. If the file cannot be read the current stop list is left untouched.
	 *
	 * @param string $path Path of the stop word file.
	 * @return bool True when the file was read, false otherwise.
	 */
	public function load_stops($path)
	{
		if( !is_file($path) || !is_readable($path) )
			return false;

		$raw = file_get_contents($path);
		if( $raw === false )
			return false;

		// normalise CRLF / CR line endings so every entry ends up on its own line
		$lines = explode("\n", str_replace(array("\r\n", "\r"), "\n", $raw));

		$stops = array();
		foreach($lines as $line) {
			$line = trim($line);
			if( $line !== '' )
				$stops[] = $line;
		}

		$this->stop_list = $stops;
		return true;
	}


	/**
	 * Delivers a flat list of tags for a piece of content: every repeated
	 * two-word phrase found, followed by the most frequent single words.
	 *
	 * @param string $content      Text (may contain HTML) to analyse.
	 * @param int    $single_words How many single words to append.
	 * @return array List of tag strings.
	 */
	public function get_tags($content, $single_words=5)
	{

		// kick start the tag generation (fills $this->phrases and $this->words)
		$this->phrases($content);

		// all phrases first, then the top single words
		$tags = array_merge( $this->phrases, array_slice($this->words, 0, $single_words) );

		// only the tag text is wanted, not the frequencies
		return array_keys($tags);

	}


	/**
	 * Analyses a text and returns its best two-word phrases.
	 *
	 * Side effects: sets $this->words, $this->phrases and $this->times.
	 *
	 * @param string $str Text to analyse; HTML tags and http(s) URLs are stripped first.
	 * @param int    $num Maximum number of entries to return.
	 * @return array List of phrase strings, best first. When the text contains
	 *               no repeated phrase at all, the first $num entries of the
	 *               word => frequency map are returned instead.
	 */
	public function phrases($str, $num=3) {

		$this->times['start'] = microtime(true);

		$str = strip_tags( (string) $str );
		$str = preg_replace('#https?://[^ "\'><]{4,500}#i', '', $str);

		$this->words = $this->kws($str, true);
		$this->phrases = $this->two_grams($str);

		// if there's no phrases found, fall back to the most frequent words
		if( count($this->phrases) < 1 )
			$result = array_slice( $this->words, 0, $num );

		// otherwise return the phrases that contain the best match of the words
		else
			$result = $this->unique_intersection($this->phrases, $this->words, $num);

		// timing is recorded on every path, including the fallback
		$this->times['finish'] = microtime(true);
		$this->times['taken'] = $this->times['finish'] - $this->times['start'];

		return $result;

	}


	/**
	 * Ranks phrases by the summed frequency of their first two words and
	 * returns the top ones.
	 *
	 * @param array $phrases phrase => frequency map (only the keys are used).
	 * @param array $words   word => frequency map.
	 * @param int   $num     Maximum number of phrases to return.
	 * @return array List of phrases whose score is at least 2, highest score first.
	 */
	public function unique_intersection($phrases, $words, $num=3) {

		$ranked = array();

		// score each phrase by how frequent the words it contains are
		foreach($phrases as $phrase => $freq) {

			$score = 0;

			// only the first two words of a phrase count towards its score
			foreach( array_slice( explode(' ', (string) $phrase), 0, 2 ) as $bit )
				if( isset($words[$bit]) )
					$score = $score + $words[$bit];

			if( $score >= 2 )
				$ranked[ $phrase ] = $score;

		}

		// sort by score and keep the top ones. The phrases are the keys, so
		// read them with array_keys(); flipping the array would merge every
		// phrase that shares a score into a single entry.
		arsort($ranked);
		return array_keys( array_slice($ranked, 0, $num, true) );

	}


	/**
	 * Builds the keyword frequency table of a text.
	 *
	 * Words of three characters or fewer, purely numeric words and stop
	 * words are discarded.
	 *
	 * @param string $str      Text to analyse.
	 * @param bool   $onlyfreq When true, return the whole word => frequency
	 *                         map (most frequent first).
	 * @param int    $min_freq Minimum occurrences for a word to be a keyword.
	 * @param int    $num      Maximum number of keywords to return.
	 * @return array word => frequency map when $onlyfreq is true, otherwise a
	 *               list of keyword strings.
	 */
	public function kws( $str, $onlyfreq=false, $min_freq=2, $num=5 ) {

		// clean the string: everything that is not a letter, digit or space becomes a space
		$str = strtolower( preg_replace('#[^0-9a-z ]+#si', ' ', (string) $str) );

		// remove anything 3 chars or less
		$str = preg_replace('#\b[a-z0-9]{1,3}\b#si', ' ', $str);

		// remove any strings that are just numbers
		$str = preg_replace('#\b[0-9]+\b#si', ' ', $str);

		// split into words; NO_EMPTY means an empty text yields no words at all
		$bits = preg_split('#\s+#', $str, -1, PREG_SPLIT_NO_EMPTY);

		// kill all stops
		$stops = $this->normalized_stops();
		if( $bits && $stops ) {

			// Give every word its OWN leading and trailing space. With a single
			// shared space between words, " the the " only matches once and
			// runs of the same stop word would survive.
			$padded = ' ' . implode('  ', $bits) . ' ';

			$needles = array();
			foreach($stops as $stop => $unused)
				$needles[] = ' ' . str_replace(' ', '  ', (string) $stop) . ' ';

			$padded = str_replace($needles, '', $padded);
			$bits = preg_split('#\s+#', $padded, -1, PREG_SPLIT_NO_EMPTY);

		}

		// count and order by freq
		$freq = array_count_values($bits);
		arsort($freq);

		if( $onlyfreq )
			return $freq;

		// return a MAX of $num keywords, provided they occur at least $min_freq times.
		// $freq is sorted descending, so the first word below the threshold ends the scan.
		$kws = array();
		foreach($freq as $kw => $f) {

			if( count($kws) >= $num || $f < $min_freq )
				break;

			$kws[] = $kw;

		}

		return $kws;

	}


	/**
	 * Finds repeated two-word phrases in a text.
	 *
	 * Every pair of adjacent words is considered. Pairs containing a stop
	 * word or a numeric word, or whose combined length is 4 characters or
	 * fewer, are ignored.
	 *
	 * @param string $str      Text to analyse.
	 * @param int    $min_freq Minimum occurrences for a phrase to be returned.
	 * @return array "word word" => frequency map, most frequent first.
	 */
	public function two_grams( $str, $min_freq=2 ) {

		// clean the string
		$str = strtolower( preg_replace('#[^0-9a-z ]+#si', ' ', (string) $str) );

		// split on whitespace runs, dropping empty pieces
		$bits = preg_split('#\s+#', $str, -1, PREG_SPLIT_NO_EMPTY);
		$total = count($bits);

		$stops = $this->normalized_stops();

		// Two passes: pairs starting on even positions, then pairs starting on
		// odd positions, so no phrase straddling a pair border is missed.
		$uniques = array();
		for($offset = 0; $offset < 2; $offset++) {

			for($i = $offset; $i + 1 < $total; $i += 2) {

				$first = $bits[$i];
				$second = $bits[$i + 1];

				if( isset($stops[$first]) || isset($stops[$second]) )
					continue;

				if( is_numeric($first) || is_numeric($second) )
					continue;

				if( strlen($first . $second) <= 4 )
					continue;

				$uniques[] = $first . ' ' . $second;

			}

		}

		$freq = array_count_values($uniques);
		arsort($freq);

		// now only return things that occur often enough
		$finals = array();
		foreach($freq as $kw => $val)
			if( $val >= $min_freq )
				$finals[$kw] = $val;

		return $finals;

	}


	/**
	 * Returns the stop list as a lookup set (stop word => true).
	 *
	 * Entries are trimmed, lower-cased and have inner whitespace collapsed to
	 * single spaces; empty and non-scalar entries are dropped.
	 */
	private function normalized_stops() {

		$set = array();

		foreach( (array) $this->stop_list as $stop ) {

			if( !is_scalar($stop) )
				continue;

			$stop = strtolower( trim( preg_replace('#\s+#', ' ', (string) $stop) ) );

			if( $stop !== '' )
				$set[$stop] = true;

		}

		return $set;

	}


}





