xfinity_cm_data_scraper.php

<?php
/**
 * Xfinity Cable Modem Stats Gatherer.
 *
 * This will gather stats about your Xfinity cable modem, at least for model CGM4140COM.
 * It will output its data in Influx line format for ingestion into Influx.
 *
 * You'll need to install php-cli, php-curl, php-xml
 */

// Local troubleshooting switch: flip to true to have PHP display every error.
define( 'DEBUG', false );

if ( defined( 'DEBUG' ) && true === (bool) DEBUG ) {
	// Report all error levels, and show both runtime and startup errors.
	error_reporting( E_ALL );
	ini_set( 'display_startup_errors', 1 );
	ini_set( 'display_errors', 1 );
}

/**
 * Xfinity Modem Stats Class
 */
class Xfinity_Modem_Stats {
	/**
	 * Base URL of the modem's admin interface; page paths are appended to it.
	 *
	 * @var string
	 */
	public static $modem_url = 'http://10.0.0.1';

	/**
	 * Path of the page the login form is POSTed to.
	 *
	 * @var string
	 */
	public static $login_page = '/check.jst';

	/**
	 * Admin username
	 *
	 * @var string
	 */
	public static $username = 'admin';

	/**
	 * Admin password
	 *
	 * @var string
	 */
	public static $password = 'hunter2';

	/**
	 * Path of the page that contains the stats tables.
	 *
	 * @var string
	 */
	public static $data_page = '/network_setup.jst';

	/**
	 * Value written as the `source` tag on every emitted line.
	 *
	 * @var string
	 */
	public static $tag = 'xfinity_modem';

	/**
	 * Snippet found in a returned page when the modem rejects the session cookie.
	 */
	private const LOGIN_REQUIRED_MARKER = '<script type="text/javascript">alertLoc("Please Login First!"); location.href="home_loggedout.jst";</script>';

	/**
	 * How each row (by index within the table) of the codewords table is converted.
	 */
	private const CODEWORD_ROWS = array(
		1 => 'int', // Channel ID.
		2 => 'int', // Unerrored Codewords.
		3 => 'int', // Correctable Codewords.
		4 => 'int', // Uncorrectable Codewords.
	);

	/**
	 * How each row of the downstream bonding table is converted.
	 *
	 * A null entry means the row is recognised but skipped. Change it to 'text' to
	 * include the string value in the output.
	 */
	private const DOWNSTREAM_ROWS = array(
		1 => 'int',  // Channel ID.
		2 => null,   // Lock Status (string).
		3 => 'mhz',  // Frequency.
		4 => 'db',   // SNR.
		5 => 'dbmv', // Power Level.
		6 => null,   // Modulation (string).
	);

	/**
	 * How each row of the upstream bonding table is converted.
	 *
	 * A null entry means the row is recognised but skipped. Change it to 'text' to
	 * include the string value in the output.
	 */
	private const UPSTREAM_ROWS = array(
		1 => 'int',  // Channel ID.
		2 => null,   // Lock Status (string).
		3 => 'mhz',  // Frequency.
		4 => 'text', // Symbol Rate.
		5 => 'dbmv', // Power Level.
		6 => null,   // Modulation (string).
		7 => null,   // Channel Type (string).
	);

	/**
	 * Performs one HTTP request against the modem and returns the raw response.
	 *
	 * @param string $path    Path appended to $modem_url.
	 * @param array  $options Extra cURL options (option constant => value), applied in order.
	 *
	 * @return string|false Response as returned by curl_exec(), or false when the transfer failed.
	 */
	private static function request( string $path, array $options = array() ) {
		$ch = curl_init( self::$modem_url . $path );

		if ( false === $ch ) {
			return false;
		}

		curl_setopt( $ch, CURLOPT_RETURNTRANSFER, true );
		foreach ( $options as $option => $value ) {
			curl_setopt( $ch, $option, $value );
		}

		$response = curl_exec( $ch );
		curl_close( $ch );

		return is_string( $response ) ? $response : false;
	}

	/**
	 * Grabs an authentication cookie.
	 *
	 * A cached cookie is reused when available. Otherwise the credentials are
	 * POSTed to the login page and the DUKSID cookie is read from the response
	 * headers, then cached for five minutes.
	 *
	 * @return string|false Auth cookie value, or false when no cookie could be obtained.
	 */
	public static function authenticate() {
		$login_cookie = Cache::get( 'login_cookie' );

		if ( $login_cookie ) {
			return $login_cookie;
		}

		// http_build_query() URL-encodes the values, so credentials containing
		// characters such as "&", "=" or "+" cannot corrupt the form body.
		$data = http_build_query(
			array(
				'username' => self::$username,
				'password' => self::$password,
				'locale'   => 'false',
			),
			'',
			'&'
		);

		$response = self::request(
			self::$login_page,
			array(
				CURLOPT_POST       => true,
				CURLOPT_POSTFIELDS => $data,
				CURLOPT_HEADER     => true, // Get headers in the response.
			)
		);

		if ( false === $response ) {
			return false;
		}

		// Header names are case-insensitive, and the cookie value ends at the first ";" or line end.
		if ( 1 !== preg_match( '/^Set-Cookie:\s*DUKSID=([^;\r\n]*)/mi', $response, $matches ) ) {
			return false;
		}

		$login_cookie = trim( $matches[1] );

		if ( '' === $login_cookie ) {
			return false;
		}

		Cache::set( 'login_cookie', $login_cookie, 'default', 60 * 5 ); // 5 minutes.

		return $login_cookie;
	}

	/**
	 * Gets modem stats HTML page.
	 *
	 * The page is cached for 60 seconds (5 seconds while DEBUG is true). Failed
	 * transfers are never cached. If the modem answers with its "please login"
	 * page, both the cookie and the cached page are dropped and the script ends.
	 *
	 * @param  string|false $login_cookie Auth cookie value.
	 *
	 * @return string|false HTML page for stats, or false without a cookie or on a failed transfer.
	 */
	public static function get_modem_stats_html( $login_cookie = false ) {
		if ( ! $login_cookie ) {
			return false;
		}

		$html = Cache::get( 'modem_stats_html' );

		if ( ! $html ) {
			$html = self::request(
				self::$data_page,
				array(
					CURLOPT_HTTPHEADER => array( sprintf( 'Cookie: DUKSID=%s', $login_cookie ) ),
				)
			);

			if ( false === $html || '' === $html ) {
				return false;
			}

			// DEBUG is always defined, so its value (not its existence) selects the short TTL.
			$ttl = ( defined( 'DEBUG' ) && DEBUG ) ? 5 : 60;

			Cache::set( 'modem_stats_html', $html, 'default', $ttl );
		}

		if ( str_contains( $html, self::LOGIN_REQUIRED_MARKER ) ) {
			// Drop the rejected cookie and the login page cached under the stats key,
			// so the next run authenticates and fetches again.
			Cache::delete( 'login_cookie' );
			Cache::delete( 'modem_stats_html' );
			die( 'Login Unsuccesful!' );
		}

		return $html;
	}

	/**
	 * Parse the HTML and get the CM Channel stats
	 *
	 * @param  string|false $html Stats Page HTML.
	 *
	 * @return string|false JSON containing channel stats, or false when no HTML was given.
	 *
	 * @throws Exception When the table contains a row this parser does not know.
	 */
	public static function parse_cm_stats( $html = false ) {
		if ( ! is_string( $html ) || '' === $html ) {
			return false;
		}

		$data = self::extract_table( $html, array( 'CM Error Codewords' ), self::CODEWORD_ROWS );

		return json_encode( $data, JSON_PRETTY_PRINT );
	}

	/**
	 * Parse the HTML and get the Downstream Bonding stats
	 *
	 * @param  string|false $html Stats Page HTML.
	 *
	 * @return string|false JSON containing channel stats, or false when no HTML was given.
	 *
	 * @throws Exception When the table contains a row this parser does not know.
	 */
	public static function parse_downstream_bonding( $html = false ) {
		if ( ! is_string( $html ) || '' === $html ) {
			return false;
		}

		$data = self::extract_table( $html, array( 'Channel Bonding Value', 'Downstream' ), self::DOWNSTREAM_ROWS );

		return json_encode( $data, JSON_PRETTY_PRINT );
	}

	/**
	 * Parse the HTML and get the Upstream Bonding stats
	 *
	 * @param  string|false $html Stats Page HTML.
	 *
	 * @return string|false JSON containing channel stats, or false when no HTML was given.
	 *
	 * @throws Exception When the table contains a row this parser does not know.
	 */
	public static function parse_upstream_bonding( $html = false ) {
		if ( ! is_string( $html ) || '' === $html ) {
			return false;
		}

		$data = self::extract_table( $html, array( 'Channel Bonding Value', 'Upstream' ), self::UPSTREAM_ROWS );

		return json_encode( $data, JSON_PRETTY_PRINT );
	}

	/**
	 * Locates a stats table and collects its cell values per row header.
	 *
	 * The table is the last <table> whose text contains every marker. Rows whose
	 * text contains the first marker are treated as the heading row and skipped.
	 * Each remaining row is one field: its <td> values are appended under the <th>
	 * text found at position (row index - 1).
	 *
	 * Ends the script (after dropping the cached page) when no table matches.
	 *
	 * @param string $html      Stats page HTML.
	 * @param array  $markers   Strings that must all appear in the table's text.
	 * @param array  $row_kinds Row index => conversion kind (see convert_cell()), or null to skip the row.
	 *
	 * @return array Header text => list of converted cell values.
	 *
	 * @throws Exception When a row with cells is not listed in $row_kinds or has no header.
	 */
	private static function extract_table( string $html, array $markers, array $row_kinds ) {
		$doc = new DOMDocument();

		// Keep libxml's parse warnings out of the script output without using "@".
		$previous = libxml_use_internal_errors( true );
		$doc->loadHTML( $html );
		libxml_clear_errors();
		libxml_use_internal_errors( $previous );

		$target_table = null;

		foreach ( $doc->getElementsByTagName( 'table' ) as $table ) {
			$has_all_markers = true;

			foreach ( $markers as $marker ) {
				if ( ! str_contains( $table->nodeValue, $marker ) ) {
					$has_all_markers = false;
					break;
				}
			}

			// No early exit: when several tables match, the last one wins.
			if ( $has_all_markers ) {
				$target_table = $table;
			}
		}

		if ( ! $target_table ) {
			Cache::delete( 'modem_stats_html' );
			die( 'Target table not found.' );
		}

		$headers = array();
		foreach ( $target_table->getElementsByTagName( 'th' ) as $header ) {
			$headers[] = trim( $header->nodeValue );
		}

		$heading_marker = (string) reset( $markers );
		$data           = array();

		foreach ( $target_table->getElementsByTagName( 'tr' ) as $row_index => $row ) {
			// We don't need the header again.
			if ( str_contains( $row->nodeValue, $heading_marker ) ) {
				continue;
			}

			$cells = $row->getElementsByTagName( 'td' );

			if ( 0 === $cells->length ) {
				continue;
			}

			if ( ! array_key_exists( $row_index, $row_kinds ) ) {
				if ( defined( 'DEBUG' ) && DEBUG ) {
					var_dump( $row );
				}
				throw new Exception( 'Unexpected Table Row Found.' );
			}

			$kind = $row_kinds[ $row_index ];

			if ( null === $kind ) {
				continue; // Known row that is intentionally left out of the output.
			}

			if ( ! isset( $headers[ $row_index - 1 ] ) ) {
				throw new Exception( 'No header found for table row.' );
			}

			$field = $headers[ $row_index - 1 ];

			foreach ( $cells as $cell ) {
				$data[ $field ][] = self::convert_cell( $kind, (string) $cell->nodeValue );
			}
		}

		return $data;
	}

	/**
	 * Converts one table cell according to its row's conversion kind.
	 *
	 * Kinds: 'int' casts to integer; 'mhz' strips "MHz" and scales values above
	 * 1000000 down by 1000000 (otherwise casts to integer); 'db' and 'dbmv' strip
	 * the unit and trim; 'text' only trims.
	 *
	 * @param string $kind Conversion kind.
	 * @param string $raw  Raw cell text.
	 *
	 * @return int|float|string Converted value.
	 *
	 * @throws Exception When $kind is not one of the known kinds.
	 */
	private static function convert_cell( string $kind, string $raw ) {
		switch ( $kind ) {
			case 'int':
				return (int) $raw;
			case 'mhz':
				$text = trim( str_replace( 'MHz', '', $raw ) );
				// Convert to a real number first so the comparison and division are numeric.
				$number = is_numeric( $text ) ? $text + 0 : (int) $text;

				// Convert Hz to MHz.
				return $number > 1000000 ? $number / 1000000 : (int) $number;
			case 'db':
				return trim( str_replace( 'dB', '', $raw ) );
			case 'dbmv':
				return trim( str_replace( 'dBmV', '', $raw ) );
			case 'text':
				return trim( $raw );
			default:
				throw new Exception( 'Unknown cell kind: ' . $kind );
		}
	}

	/**
	 * Sanitize a header to be compatible with Influx.
	 *
	 * Lowercases the text, collapses every run of characters other than a-z and
	 * 0-9 into a single underscore, and trims underscores from both ends.
	 *
	 * @param  string $str Header to sanitize.
	 *
	 * @return string      Sanitized header.
	 */
	public static function sanitize_header( $str ) {
		$str = strtolower( $str );
		$str = preg_replace( '/[^a-z0-9]+/', '_', $str );

		return trim( $str, '_' );
	}

	/**
	 * Renders one field value for Influx line format.
	 *
	 * Numbers and numeric strings are written bare; booleans as true/false; any
	 * other value as a double-quoted string with `"` and `\` escaped.
	 *
	 * @param mixed $value Decoded JSON value.
	 *
	 * @return string Field value ready to follow "name=".
	 */
	private static function format_field_value( $value ): string {
		if ( is_bool( $value ) ) {
			return $value ? 'true' : 'false';
		}

		if ( is_int( $value ) || is_float( $value ) || ( is_string( $value ) && is_numeric( $value ) ) ) {
			return trim( (string) $value );
		}

		return '"' . addcslashes( (string) $value, '"\\' ) . '"';
	}

	/**
	 * Converts JSON object to Influx Line format.
	 *
	 * The JSON must decode to a map of field name => list of values. The
	 * 'Channel ID' list drives the output: one line per channel, tagged with
	 * channel_id and source, carrying every other field's value at the same
	 * position. Fields with no value at that position are left out, and a
	 * channel with no fields at all produces no line.
	 *
	 * @param  string $json  Cable Modem JSON data.
	 * @param  string $group Group for data, used as the measurement name.
	 *
	 * @return string       Line format data, one line per channel, newline separated.
	 *
	 * @throws Exception When $json is not a non-empty JSON object/array.
	 */
	public static function json_to_influx_line( $json, $group ) {
		$data = is_string( $json ) ? json_decode( $json, true ) : null;

		if ( ! is_array( $data ) || ! $data ) {
			throw new Exception( 'Invalid JSON provided.' );
		}

		$channel_ids = $data['Channel ID'] ?? array();
		unset( $data['Channel ID'] );

		if ( ! is_array( $channel_ids ) ) {
			$channel_ids = array();
		}

		$lines = array();

		foreach ( $channel_ids as $index => $channel_id ) {
			$fields = array();

			foreach ( $data as $field => $values ) {
				if ( ! is_array( $values ) || ! isset( $values[ $index ] ) ) {
					continue; // "name=" with nothing after it is not valid line format.
				}

				$fields[] = sprintf(
					'%s=%s',
					self::sanitize_header( $field ),
					self::format_field_value( $values[ $index ] )
				);
			}

			if ( ! $fields ) {
				continue;
			}

			$lines[] = sprintf(
				'%s,channel_id=%d,source=%s %s',
				$group,
				$channel_id,
				self::$tag,
				implode( ',', $fields )
			);
		}

		return implode( "\n", $lines );
	}
}

/**
 * Caching Class
 */
class Cache {
	/**
	 * Returns the base cache directory.
	 *
	 * @return string The cache directory, inside the system temp directory, with trailing slash.
	 */
	private static function get_cache_dir() {
		return sys_get_temp_dir() . '/_php_custom_cache/';
	}

	/**
	 * Returns the cache group directory.
	 *
	 * @param string $group The cache group.
	 * @return string The cache group directory, with trailing slash.
	 */
	private static function get_group_dir( $group = 'default' ) {
		return self::get_cache_dir() . self::sanitize( $group ) . '/';
	}

	/**
	 * Returns the cache filename for a given key and group.
	 *
	 * @param string $key   The cache key.
	 * @param string $group The cache group.
	 * @return string The cache filename.
	 */
	private static function get_cache_filename( $key, $group = 'default' ) {
		return self::get_group_dir( $group ) . self::sanitize( $key ) . '.cache';
	}

	/**
	 * Sanitizes names for safe use as file and directory names.
	 *
	 * Every character other than letters, digits, "_" and "-" is removed.
	 *
	 * @param string $name The name to sanitize.
	 * @return string The sanitized name.
	 */
	private static function sanitize( $name ) {
		return preg_replace( '/[^A-Za-z0-9\_\-]/', '', $name );
	}

	/**
	 * Retrieves the cache for the given key and group.
	 *
	 * Expired, unreadable or malformed cache files are removed and reported as a miss.
	 *
	 * @param string $key   The cache key.
	 * @param string $group The cache group.
	 * @param bool   $force Accepted for signature compatibility; this implementation does not read it.
	 * @param bool   $found Whether the key was found in the cache. Disambiguates a return of false, a storable value.
	 * @return bool|mixed False on failure to retrieve cache or the cache's stored value.
	 */
	public static function get( $key, $group = 'default', $force = false, &$found = null ) {
		$found    = false;
		$filename = self::get_cache_filename( $key, $group );

		if ( ! is_file( $filename ) ) {
			return false;
		}

		$raw = file_get_contents( $filename );

		// NOTE(review): unserialize() runs on files kept in the shared temp directory.
		// If other local users can write there, consider a non-executable format such as JSON.
		$data = ( false === $raw ) ? false : unserialize( $raw );

		$is_valid = is_array( $data ) && array_key_exists( 'value', $data ) && isset( $data['expire'] );

		// An expire of 0 means the entry never expires.
		if ( $is_valid && ( 0 === (int) $data['expire'] || $data['expire'] > time() ) ) {
			$found = true;
			return $data['value'];
		}

		unlink( $filename ); // Remove expired or unusable cache file.

		return false;
	}

	/**
	 * Sets or updates the cache for the given key and group.
	 *
	 * @param string $key   The cache key.
	 * @param mixed  $data  The data to store.
	 * @param string $group The cache group.
	 * @param int    $expire Lifetime in seconds from now; 0 (the default) means the entry never expires.
	 * @return bool True on successful set, false on failure.
	 */
	public static function set( $key, $data, $group = 'default', $expire = 0 ) {
		$dir = self::get_group_dir( $group );

		// The second is_dir() covers another process creating the directory first.
		if ( ! is_dir( $dir ) && ! mkdir( $dir, 0777, true ) && ! is_dir( $dir ) ) {
			return false;
		}

		$payload = array(
			'value'  => $data,
			// Store 0 verbatim: time() + 0 would make "never expires" entries expire at once.
			'expire' => $expire > 0 ? time() + $expire : 0,
		);

		return false !== file_put_contents( self::get_cache_filename( $key, $group ), serialize( $payload ) );
	}

	/**
	 * Adds a cache for the given key and group, if it does not already exist.
	 *
	 * Existence is decided by the lookup's found flag, so a stored value of
	 * false counts as existing.
	 *
	 * @param string $key   The cache key.
	 * @param mixed  $data  The data to store.
	 * @param string $group The cache group.
	 * @param int    $expire Lifetime in seconds from now; 0 means the entry never expires.
	 * @return bool True on successful add, false on failure.
	 */
	public static function add( $key, $data, $group = 'default', $expire = 0 ) {
		$found = false;
		self::get( $key, $group, false, $found );

		if ( $found ) {
			return false;
		}

		return self::set( $key, $data, $group, $expire );
	}

	/**
	 * Deletes the cache for the given key and group.
	 *
	 * @param string $key   The cache key.
	 * @param string $group The cache group.
	 * @return bool True on successful delete, false on failure.
	 */
	public static function delete( $key, $group = 'default' ) {
		$filename = self::get_cache_filename( $key, $group );

		if ( ! file_exists( $filename ) ) {
			return false;
		}

		return unlink( $filename );
	}

	/**
	 * Deletes all cache in the given group.
	 *
	 * @param string $group The cache group.
	 * @return bool True on successful delete, false on failure.
	 */
	public static function delete_group( $group = 'default' ) {
		$dir = self::get_group_dir( $group );

		if ( ! is_dir( $dir ) ) {
			return false;
		}

		// glob() returns false on error; treat that as "nothing to delete".
		$files = glob( $dir . '*' );

		foreach ( $files ? $files : array() as $file ) {
			if ( is_file( $file ) ) {
				unlink( $file );
			}
		}

		return rmdir( $dir );
	}
}

// Scrape the modem once and print each stats table in Influx line format.
// Failures are reported on stderr with a non-zero exit code, so an error
// message never ends up in the output that Influx ingests.
try {
	$login_cookie = Xfinity_Modem_Stats::authenticate();
	$html         = Xfinity_Modem_Stats::get_modem_stats_html( $login_cookie );

	if ( ! $html ) {
		throw new Exception( 'Could not retrieve the modem stats page.' );
	}

	$cm_json = Xfinity_Modem_Stats::parse_cm_stats( $html );
	$db_json = Xfinity_Modem_Stats::parse_downstream_bonding( $html );
	$ub_json = Xfinity_Modem_Stats::parse_upstream_bonding( $html );

	// Build every group before printing so a late failure cannot leave partial output.
	$output = array(
		Xfinity_Modem_Stats::json_to_influx_line( $cm_json, 'codewords' ),
		Xfinity_Modem_Stats::json_to_influx_line( $db_json, 'downstream_bonding' ),
		Xfinity_Modem_Stats::json_to_influx_line( $ub_json, 'upstream_bonding' ),
	);

	echo implode( PHP_EOL, $output ) . PHP_EOL;
} catch ( Throwable $e ) {
	file_put_contents( 'php://stderr', $e->getMessage() . PHP_EOL );
	exit( 1 );
}

Other Posts Not Worth Reading

Hey, You!

Like this kind of garbage? Subscribe for more! I post about once a month, unless I find something interesting to write about.