December 3, 2024

Shell Script:
Automatically Checking and Summarizing Plesk Web Server Logs

The following shell script automatically checks and summarizes the Apache web server logs of all domains hosted in Plesk. This provides an overview of traffic and potential problems, even across many domains and large log files.
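
For example, assuming the script has been saved as /root/analyze_logs.sh (name and location are an assumption, not part of the script), it accepts up to seven positional arguments: hours back, number of entries, a specific domain, a mail recipient, the bot minimum count, the IP minimum count, and the log file name.

# Make the script executable
chmod +x /root/analyze_logs.sh

# Use the defaults: last 24 hours, top 30 entries, all domains
/root/analyze_logs.sh

# Last 48 hours, top 50 entries, restricted to a single domain
/root/analyze_logs.sh 48 50 example.com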

#!/bin/bash

# Define output files (the target directory must exist)
output_file="/var/www/vhosts/_analyze_logs/traffic-pages.txt"
output_wordpress="/var/www/vhosts/_analyze_logs/traffic-wordpress.txt"
output_5xx="/var/www/vhosts/_analyze_logs/traffic-5xx.txt"
output_404="/var/www/vhosts/_analyze_logs/traffic-404.txt"
output_bots="/var/www/vhosts/_analyze_logs/traffic-bots.txt"
output_ips="/var/www/vhosts/_analyze_logs/traffic-ips.txt"

# Define temporary files
temp_file="/tmp/combined_traffic.txt"
temp_wordpress="/tmp/combined_wordpress.txt"
temp_5xx="/tmp/combined_5xx.txt"
temp_404="/tmp/combined_404.txt"

# Default values
hours_back=24
num_entries=30
domain=""
mail_results=""
bots_min_count=500
ips_min_count=1000
log_file="access_ssl_log"

# Override the defaults with positional arguments: hours back, number of
# entries, specific domain, mail recipient, bots_min_count, ips_min_count,
# and log file
if [ $# -gt 0 ]; then
    hours_back=$1
fi

if [ $# -gt 1 ]; then
    num_entries=$2
fi

if [ $# -gt 2 ]; then
    domain=$3
fi

if [ $# -gt 3 ]; then
    mail_results=$4
fi

if [ $# -gt 4 ]; then
    bots_min_count=$5
fi

if [ $# -gt 5 ]; then
    ips_min_count=$6
fi

if [ $# -gt 6 ]; then
    log_file=$7
fi

# Start timing the script
start_time=$(date +%s%3N)

# Calculate the date and time threshold in Apache log format.
# Note: the lexicographic comparison used below is only reliable
# within the same calendar month.
date_threshold=$(date -d "-$hours_back hours" +"%d/%b/%Y:%H:%M:%S")

# Initialize the output files
echo "Top $num_entries Most Viewed Pages Across All Logs (Last $hours_back hours)" > "$output_file"
echo "Generated on: $(date +'%d/%b/%Y %H:%M:%S')" >> "$output_file"
echo "" >> "$output_file"

echo "Top $num_entries WordPress Related Traffic Across All Logs (Last $hours_back hours)" > "$output_wordpress"
echo "Generated on: $(date +'%d/%b/%Y %H:%M:%S')" >> "$output_wordpress"
echo "" >> "$output_wordpress"

echo "Top $num_entries 5xx Errors Across All Logs (Last $hours_back hours)" > "$output_5xx"
echo "Generated on: $(date +'%d/%b/%Y %H:%M:%S')" >> "$output_5xx"
echo "" >> "$output_5xx"

echo "Top $num_entries 404 Errors Across All Logs (Last $hours_back hours)" > "$output_404"
echo "Generated on: $(date +'%d/%b/%Y %H:%M:%S')" >> "$output_404"
echo "" >> "$output_404"

echo "Bot Traffic >= $bots_min_count Across All Logs (Last $hours_back hours)" > "$output_bots"
echo "Generated on: $(date +'%d/%b/%Y %H:%M:%S')" >> "$output_bots"
echo "" >> "$output_bots"

echo "Top $num_entries IP Addresses Across All Logs (Last $hours_back hours)" > "$output_ips"
echo "Generated on: $(date +'%d/%b/%Y %H:%M:%S')" >> "$output_ips"
echo "" >> "$output_ips"

# Initialize the temporary files
> "$temp_file"
> "$temp_wordpress"
> "$temp_5xx"
> "$temp_404"

# Determine the log files to process
if [ -n "$domain" ]; then
    logs_to_process=(/var/www/vhosts/$domain/logs/$log_file)
else
    logs_to_process=(/var/www/vhosts/*/logs/$log_file)
fi

# Define pattern for static files
static_files_pattern='\.(css|js|jpg|jpeg|png|gif|webp|ico|svg|woff|woff2|ttf|eot|pdf|zip|map|mp4|webm|mp3|wav|doc|docx|xls|xlsx|rar|tar|gz)(\?.*)?$'

# Define pattern for normal traffic
normal_traffic_pattern="$static_files_pattern|\/wp-content\/|\/wp-admin\/|\/wp-json\/|koko-analytics|wp-cron\.php|\/wp-includes\/|wc-ajax=|\/favicons\/|\/xmlrpc\.php|\/feed\/|robots\.txt|sitemap|wp-login\.php"

# Define pattern for WordPress traffic
wordpress_traffic_pattern="wp-cron\.php|\/wp-includes\/|wc-ajax=|\/xmlrpc\.php|wp-login\.php|\/wp-json\/|\/wp-admin\/|\/login\/|\/wp-content\/themes\/|\/wp-content\/plugins\/|\/feed\/|\/wp-comments-post\.php|\/trackback\/"

# Define bot search patterns
bot_pattern_include="bot|spider|crawler|slurp|bing|yandex|baidu|AdsBot-Google|Googlebot|Applebot|BingSapphire|Plesk screenshot bot|bingbot|Bytespider|DuckDuckBot|Xing Bot|YandexBot|Sogou Spider|Yahoo! Slurp|Facebot"
bot_pattern_exclude="pingdom|UptimeRobot|StatusCake|Site24x7|Uptime\.com|Monitis|Uptrends|Dotcom-Monitor|Updown\.io|Hetrix|NodePing"

for log in "${logs_to_process[@]}"; do
    # Extract domain name
    domain=$(basename "$(dirname "$(dirname "$log")")")
    
    # Process normal traffic: with default field splitting, $4 holds
    # "[dd/Mon/yyyy:HH:MM:SS", so the leading "[" is stripped before the
    # timestamp comparison, and $7 is the requested path
    awk -v date_threshold="$date_threshold" -v domain="$domain" \
        'substr($4, 2) > date_threshold {print domain $7}' "$log" | \
    grep -v -E "$normal_traffic_pattern" >> "$temp_file"
    
    # Process WordPress traffic
    awk -v date_threshold="$date_threshold" -v domain="$domain" \
        'substr($4, 2) > date_threshold {print domain $7}' "$log" | \
    grep -E "$wordpress_traffic_pattern" | grep -v -E "$static_files_pattern" >> "$temp_wordpress"
    
    # Process 5xx errors
    awk -v date_threshold="$date_threshold" -v domain="$domain" 'substr($4, 2) > date_threshold && $9 >= 500 && $9 < 600 {print domain $7}' "$log" >> "$temp_5xx"
    
    # Process 404 errors
    awk -v date_threshold="$date_threshold" -v domain="$domain" 'substr($4, 2) > date_threshold && $9 == 404 {print domain $7}' "$log" >> "$temp_404"

    # Process bot traffic: filter by timestamp, extract the user agent
    # (the sixth quote-delimited field), count requests per agent, and
    # keep only agents at or above the minimum count
    results=$(awk -v date_threshold="$date_threshold" 'substr($4, 2) > date_threshold' "$log" | \
    awk -F\" '{print $6}' | grep -i -E "$bot_pattern_include" | grep -i -v -E "$bot_pattern_exclude" | \
    sort | uniq -c | sort -rn | awk -v min="$bots_min_count" '$1 >= min')

    # Only output if results exist
    if [ -n "$results" ]; then
        echo "$domain" >> "$output_bots"
        echo "$results" >> "$output_bots"
        echo " " >> "$output_bots"
    fi

    # Process IP addresses
    # Extract IP addresses and count occurrences
    results=$(awk -v date_threshold="$date_threshold" 'substr($4, 2) > date_threshold {print $1}' "$log" | sort | uniq -c | sort -rn | awk -v min="$ips_min_count" '$1 >= min')

    # Only output if results exist
    if [ -n "$results" ]; then
        echo "$domain" >> "$output_ips"
        echo "$results" >> "$output_ips"
        echo " " >> "$output_ips"
    fi
done

# Aggregate and sort results for normal traffic
sort "$temp_file" | uniq -c | sort -rn | head -n "$num_entries" >> "$output_file"

# Aggregate and sort results for WordPress traffic
sort "$temp_wordpress" | uniq -c | sort -rn | head -n "$num_entries" >> "$output_wordpress"

# Aggregate and sort results for 5xx errors
sort "$temp_5xx" | uniq -c | sort -rn | head -n "$num_entries" >> "$output_5xx"

# Aggregate and sort results for 404 errors
sort "$temp_404" | uniq -c | sort -rn | head -n "$num_entries" >> "$output_404"

# Clean up temporary files
rm "$temp_file"
rm "$temp_wordpress"
rm "$temp_5xx"
rm "$temp_404"

# End timing the script
end_time=$(date +%s%3N)
execution_time=$((end_time - start_time))

echo "Results saved to:"
echo "- $output_file"
echo "- $output_wordpress" 
echo "- $output_5xx"
echo "- $output_404"
echo "- $output_bots"
echo "- $output_ips"
echo "Script execution time: ${execution_time}ms"

Shell script updated on December 5, 2024

The script creates the files traffic-pages.txt, traffic-wordpress.txt, traffic-5xx.txt, traffic-404.txt, traffic-bots.txt, and traffic-ips.txt, which contain the corresponding filtered data.
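
To run the analysis regularly, the script can be scheduled via cron. A minimal sketch, assuming the script is stored as /root/analyze_logs.sh:

# Hypothetical /etc/cron.d entry: analyze the last 24 hours every
# morning at 06:00 and keep the top 30 entries per report
0 6 * * * root /root/analyze_logs.sh 24 30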