Here's a more general solution that can find and compare URLs in text files that contain more than just URLs:
#!/bin/sh
# diffl.sh
# DIFF with Links - a "diff utility"-like .sh script
# (dash, bash, zsh compatible) that can find missing
# web links in one file compared to a group of files
#
# Usage:
#   <caller_shell> '/path/to/diffl.sh' <file1> <file2> ... <fileN> [--help|-h]
#
# Please note that: for simplicity, in this script, only
# URLs containing "://" are taken into consideration,
# although there can be URLs that do not contain it
# (such as mailto:someone@example.com)
#
# Implementation notes:
#   - "local" is deliberately not used: the out-parameter helpers assign to a
#     caller-chosen variable name through eval, and a local of the same name
#     would shadow it.
#   - Only POSIX sed/sort/uniq features are used, so the same commands work
#     with both the GNU and the BSD (macOS) tools.

# Stores a coarse operating system name ("Linux", "Mac", "Windows",
# "unknown" or "other") in the variable whose name is given as $1.
GetOS () {
    OS_kernel_name=$(uname -s)
    case "$OS_kernel_name" in
        "Linux")
            eval "$1=Linux"
            ;;
        "Darwin")
            eval "$1=Mac"
            ;;
        "CYGWIN"*|"MSYS"*|"MINGW"*)
            eval "$1=Windows"
            ;;
        "")
            eval "$1=unknown"
            ;;
        *)
            eval "$1=other"
            ;;
    esac
}

# Stores the name of the running shell ("bash", "zsh", "dash" or
# "undetermined") in the variable whose name is given as $1.
# dash exposes no version variable, so it is guessed from its default prompt.
DetectShell () {
    if [ -n "${BASH_VERSION:-}" ]; then
        eval "$1=bash"
    elif [ -n "${ZSH_VERSION:-}" ]; then
        eval "$1=zsh"
    elif [ "${PS1:-}" = '$ ' ]; then
        eval "$1=dash"
    else
        eval "$1=undetermined"
    fi
}

# Writes the escape sequence that sets the terminal title to $1 on stdout.
PrintInTitle () {
    printf "\033]0;%s\007" "$1"
}

# Sets the terminal title to $1 by writing directly to the controlling
# terminal, so that the title never ends up in the (possibly piped) output.
# Silently does nothing when there is no controlling terminal.
PrintJustInTitle () {
    { PrintInTitle "$1" >/dev/tty; } 2>/dev/null || :
}

# Prints a message (printf-style arguments) on the controlling terminal,
# falling back to stderr when no terminal is available.
PrintToTerminal () {
    # The format string always comes from this script, never from user data.
    { printf "$@" >/dev/tty; } 2>/dev/null || printf "$@" >&2
}

# Signal handler body: restores the terminal state and reports the abort.
# The traps installed in main exit right after calling it.
trap1 () {
    CleanUp
    PrintToTerminal '\nAborted.\n'
}

# Restores the default signal handlers and clears the terminal title.
CleanUp () {
    #Restore "INTERRUPT" (CTRL-C) and "TERMINAL STOP" (CTRL-Z) signals:
    trap - INT
    trap - TSTP

    #Clear the title:
    PrintJustInTitle ""
}

# Prints the usage information on stdout.
DisplayHelp () {
    printf '%s\n' \
        "" \
        "diffl - DIFF by URL web Links" \
        "" \
        "    What it does:" \
        "        - compares the URL web links in the two provided files (<file1> and <file2>) and shows the missing web links that are found in one but not in the other" \
        "    Syntax:" \
        "        <caller_shell> '/path/to/diffl.sh' <file1> <file2> ... <fileN> [flags]" \
        "        - where:" \
        "            - <caller_shell> can be any of the shells: dash, bash, zsh, or any other shell compatible with the \"dash\" shell syntax" \
        "            - '/path/to/diffl.sh' represents the path of this script" \
        "            - <file1> and <file2> represent the text files to be compared" \
        "            - if more than two files are provided as parameters (<file1>, <file2>, ..., <fileN>): the web links in <file1> are compared with all the web links in <file2>, ... <fileN>" \
        "            - [flags] can be:" \
        "                --help or -h" \
        "                    Displays this help information" \
        "    Output:" \
        "        - lines starting with '<' signify web links from <file1>" \
        "        - lines starting with '>' signify web links from <file2>, ..., <fileN>" \
        "    Notes:" \
        "        - for simplicity, in this script, only URLs containing \"://\" are taken into consideration, although there can be URLs that do not contain it (such as mailto:someone@example.com)" \
        ""
}

# Reads text on stdin and prints every "scheme://..." link found in it,
# one per line, in order of appearance.
#   Step 1: start a new line in front of every "<letters>://" so that each
#           link begins its own line (backslash + real newline is the only
#           replacement form understood by both GNU and BSD sed).
#   Step 2: keep only the lines that begin with a scheme, cut at the first
#           blank (space or tab).
ExtractLinks () {
    sed -E 's#([a-zA-Z]*://)#\
\1#g' |
        sed -n -E 's#^([a-zA-Z]+://[^[:blank:]]*).*$#\1#p'
}

# Reads links on stdin (one per line) and prints "<side> <sequence> <link>".
#   $1 - side marker: 0 for <file1>, 1 for <file2>, ..., <fileN>
#   $2 - label shown in the terminal title while counting
# The sequence number is zero-padded to a fixed width (21 digits) so that a
# plain textual sort later restores the order of appearance.
NumberLinks () {
    nl_side=$1
    nl_label=$2
    nl_count=0
    # read -r keeps backslashes, and reading line by line avoids the filename
    # expansion that an unquoted $(...) would apply to '?', '*' and '['.
    while IFS= read -r nl_link || [ -n "$nl_link" ]; do
        nl_count=$((nl_count+1))
        printf '%s %021d %s\n' "$nl_side" "$nl_count" "$nl_link"
        PrintJustInTitle "Links found [$nl_label]: $nl_count..."
    done
}

# Program entry point.
#   Arguments: <file1> <file2> ... <fileN> [--help|-h]
#   Output:    "< link" for links found only in <file1>,
#              "> link" for links found only in <file2>, ..., <fileN>
#   Returns:   0 on success (or when help was displayed), 1 on any error
main () {
    GetOS OS

    #################################################################################
    ## Uncomment the next line if your OS is not Linux or Mac (the commands used   ##
    ## are plain POSIX sed, sort and uniq, so they should work on most systems):   ##
    #################################################################################
    #OS="userdefined"

    DetectShell current_shell
    if [ "$current_shell" = "undetermined" ]; then
        PrintToTerminal '\n%s\n\n' "Warning: This script was designed to work with dash, bash and zsh shells."
    fi

    #Split the parameters into flags and the "array" files_1 ... files_N:
    help_flag="0"
    files_0=0
    for arg in "$@"; do
        case "$arg" in
            "--help"|"-h")
                help_flag="1"
                ;;
            *)
                files_0=$((files_0+1))
                eval "files_$files_0=\"\$arg\""
                ;;
        esac
    done

    if [ "$#" -eq 0 ]; then
        #if no parameters are provided: display help
        DisplayHelp
        CleanUp
        return 0
    fi

    if [ "$help_flag" = "1" ]; then
        DisplayHelp
        return 0
    fi

    #Check the required tools; "command -v" works for GNU and BSD tools alike
    #(BSD sed and uniq reject "--help", so that cannot be used as a probe):
    error="false"
    for tool in sed sort uniq; do
        if ! command -v "$tool" >/dev/null 2>&1; then
            if [ "$error" = "false" ]; then
                printf '\n' >&2
            fi
            printf '%s\n' "ERROR: Could not run \"$tool\" (necessary in order for this script to function correctly)!" >&2
            error="true"
        fi
    done
    if [ "$error" = "true" ]; then
        printf '\n' >&2
        return 1
    fi

    if [ "$OS" != "Linux" ] && [ "$OS" != "Mac" ] && [ "$OS" != "userdefined" ]; then
        printf '\n%s\n\n' "Error: Unsupported OS!" >&2
        return 1
    fi

    if [ "$files_0" -lt 2 ]; then
        printf '\n%s\n\n' "ERROR: Please provide at least two parameters!" >&2
        return 1
    fi

    #Report every unusable file before giving up:
    error="false"
    i=1
    while [ "$i" -le "$files_0" ]; do
        eval "current_file=\"\$files_$i\""
        if [ ! -f "$current_file" ]; then
            printf '\n%s\n' "ERROR: File \"$current_file\" does not exist or is not a regular file!" >&2
            error="true"
        fi
        i=$((i+1))
    done
    if [ "$error" = "true" ]; then
        printf '\n' >&2
        return 1
    fi

    #Proceed to finding and comparing links:

    #Trap "INTERRUPT" (CTRL-C) and "TERMINAL STOP" (CTRL-Z) signals and make
    #sure an aborted run really stops, with a non-zero status:
    trap 'trap1; exit 130' INT
    trap 'trap1; exit 1' TSTP

    #LC_ALL=C makes sort and uniq compare bytes, so both tools always agree on
    #which links are equal, whatever the user's locale is.
    {
        PrintJustInTitle "Searching for links [1]..."
        {
            ExtractLinks <"$files_1" | NumberLinks 0 1
            PrintJustInTitle "Sorting results [1]..."
        } | LC_ALL=C sort -u -k 3

        PrintJustInTitle "Searching for links [2]..."
        {
            #All the remaining files feed one single numbering loop, so the
            #sequence numbers keep growing from one file to the next:
            i=2
            while [ "$i" -le "$files_0" ]; do
                eval "current_file=\"\$files_$i\""
                ExtractLinks <"$current_file"
                i=$((i+1))
            done | NumberLinks 1 2
            PrintJustInTitle "Sorting results [2]..."
        } | LC_ALL=C sort -u -k 3

        PrintJustInTitle "Searching for unique links [3]..."
    } | {
        #Group by link, keep the links present on one side only, restore the
        #order of appearance (side, then sequence number), and finally replace
        #"<side> <sequence>" with the '<' or '>' marker:
        LC_ALL=C sort -k 3 |
            LC_ALL=C uniq -u -f 2 |
            LC_ALL=C sort |
            sed -E -e 's/^0 [0-9]+ /< /' -e 's/^1 [0-9]+ /> /'
        PrintJustInTitle "Done"
    }

    CleanUp
    return 0
}

main "$@"
- Syntax:
<caller_shell> '/path/to/diffl.sh' <file1> <file2> ... <fileN>
- What it does:
- this shows the URL web links that <file1> and the group of files <file2>, ..., <fileN> do not have in common
- Notes:
- for simplicity, in this script, only URLs containing "://" are taken into consideration