import requests
from bs4 import BeautifulSoup as bs
import re
import unicodedata
import sys

# Define your headers once, at the top
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
}

# --- Helper function for text normalization ---
def normalize_text(text):
    """
    Normalize text for more robust duplicate checking.
    Converts to lowercase, standardizes internal whitespace.
    You can add more normalization steps here if needed.
    """
    if not isinstance(text, str):
        return ""  # Handle non-string input gracefully

    # 1. Convert to lowercase
    text = text.lower()
    # 2. Replace various whitespace characters with a single space.
    #    This handles spaces, tabs, newlines, non-breaking spaces (\xa0), etc.
    text = re.sub(r'\s+', ' ', text).strip()
    # 3. Optional: Remove or replace problematic characters (e.g., zero-width space \u200b)
    text = text.replace('\u200b', '')  # Example: remove zero-width space

    # Optional: More advanced Unicode normalization if character representation is an issue
    # text = unicodedata.normalize('NFC', text)

    return text

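# --- Illustrative sanity check for normalize_text (a minimal sketch; the
# sample strings below are hypothetical, not taken from any real page) ---
def _demo_normalize_text():
    assert normalize_text("Hello\xa0  World\n") == "hello world"  # NBSP and newline collapse to one space
    assert normalize_text("Zero\u200bWidth") == "zerowidth"       # zero-width space is stripped
    assert normalize_text(None) == ""                             # non-string input returns ""
# _demo_normalize_text()  # uncomment to run the check
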
# --- Core Scraping Functions ---

def get_total_pages(url):
    """
    Look inside the .entry container for any <a> tags whose text is a digit.
    Returns the highest page number found, or 1 if none or on error.
    """
    print(f"Attempting to find total pages from: {url}")
    try:
        response = requests.get(url, headers=HEADERS, timeout=30)
        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
        soup = bs(response.text, 'html.parser')
        content_div = soup.select_one('.entry')
        if not content_div:
            print("Warning: Could not find .entry container for pagination.")
            return 1

        page_numbers = []
        # Look for pagination links within or near the content area.
        # Common selectors for pagination: div.page-links, div.pagination, .entry a
        # Sticking to the original scope (.entry a) but mentioning alternatives.
        pagination_candidates = content_div.select('a')  # Look at all links in .entry

        for a in pagination_candidates:
            try:
                # Get text including text from nested elements like <span>
                text = a.get_text(strip=True)
                if text.isdigit():
                    page_numbers.append(int(text))
            except ValueError:
                # Ignore links whose text is not purely a decimal digit
                pass
            except Exception as e:
                print(f"Error processing pagination link text: {e}")

        print(f"Found potential page numbers: {page_numbers}")

        return max(page_numbers) if page_numbers else 1

    except requests.exceptions.RequestException as e:
        print(f"Error fetching total pages from {url}: {e}")
        return 1  # Assume 1 page on error
    except Exception as e:
        print(f"An unexpected error occurred while getting total pages from {url}: {e}")
        return 1  # Assume 1 page on unexpected error

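# A tighter variant of the pagination scan, sketched on the assumption that the
# site uses a WordPress-style 'div.page-links' wrapper (one of the alternative
# selectors mentioned above). Not part of the main flow; adapt to the real markup.
def get_total_pages_from_page_links(soup):
    numbers = []
    for a in soup.select('div.page-links a'):
        text = a.get_text(strip=True)
        if text.isdigit():
            numbers.append(int(text))
    return max(numbers) if numbers else 1
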
def parse_page_content(url, seen_texts, debug=False, page_number=1, normalize=True):
    """
    Return only _new_ paragraphs (compared to seen_texts).
    Includes optional text normalization for more robust duplicate detection.
    """
    print(f"Scraping content from page {page_number} ({url})")
    try:
        response = requests.get(url, headers=HEADERS, timeout=30)
        response.raise_for_status()  # Raise an exception for bad status codes
        soup = bs(response.text, 'html.parser')

        # Remove non-content elements - header/footer added as common areas for repetition
        selectors_to_decompose = ['form', 'div.page-links', 'aside', 'script', 'style', 'header', 'footer']
        for sel in selectors_to_decompose:
            for tag in soup.select(sel):
                tag.decompose()

        content_div = soup.select_one('.entry')
        if not content_div:
            if debug:
                print(f"Warning: Could not find .entry container on page {page_number}.")
            return []  # Return empty list if main content div is not found

        out = []
        for p in content_div.find_all('p'):
            original_text = p.get_text(strip=True)

            # Skip empty paragraphs after stripping
            if not original_text:
                if debug:
                    print(f"Skipping empty paragraph on page {page_number}.")
                continue

            # Apply normalization for the check
            text_to_check = normalize_text(original_text) if normalize else original_text

            # --- Duplicate check ---
            # Use the normalized/checked text for the set lookup and add
            if text_to_check not in seen_texts:
                seen_texts.add(text_to_check)
                # Add the *original* text to the output list
                out.append(original_text)
                if debug:
                    # Print the start of the checked text when adding new
                    print(f"Added new paragraph (checked text): {text_to_check[:100]}...")
            elif debug:
                # This message confirms the duplicate detection is working for the *checked* text
                print(f"Duplicate (or similar) text found on page {page_number}, skipping (checked text): {text_to_check[:100]}...")

        if debug:
            print(f"Page {page_number}: Found {len(out)} new paragraphs added to output.")
            print(f"Seen texts set size: {len(seen_texts)}")
            # Print a sample of seen texts (the checked version)
            seen_sample = list(seen_texts)[:min(5, len(seen_texts))]
            print(f"Seen texts sample (checked version): {seen_sample}")

        return out

    except requests.exceptions.RequestException as e:
        print(f"Error fetching content from page {page_number} ({url}): {e}")
        return []  # Return empty list on error
    except Exception as e:
        print(f"An unexpected error occurred while parsing page {page_number} ({url}): {e}")
        return []  # Return empty list on unexpected error

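# Optional memory-saving variation (a sketch, not part of the original flow):
# for very long articles you could store a short digest of each checked string
# in seen_texts instead of the full paragraph text.
import hashlib

def text_fingerprint(text):
    # SHA-256 of the UTF-8 bytes; collisions are practically impossible here.
    return hashlib.sha256(text.encode('utf-8')).hexdigest()
# Usage inside parse_page_content would then be:
#   key = text_fingerprint(text_to_check)
#   if key not in seen_texts: seen_texts.add(key); ...
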
def get_next_page_url(base_url, page_number):
    """Append '/2/', '/3/', etc., but leave page 1 as-is."""
    if page_number <= 1:
        return base_url
    # Ensure base_url doesn't already end with a slash before adding the page number
    next_url = base_url.rstrip('/') + f'/{page_number}/'
    # print(f"Generated URL for page {page_number}: {next_url}")  # Optional debug
    return next_url

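# Example of the URLs this helper produces (the base URL is hypothetical):
#   get_next_page_url('https://example.com/article', 1)  -> 'https://example.com/article'
#   get_next_page_url('https://example.com/article', 2)  -> 'https://example.com/article/2/'
#   get_next_page_url('https://example.com/article/', 3) -> 'https://example.com/article/3/'
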
def parse_all_pages(start_url, debug=False, normalize=True):
    """Orchestrates the scraping of all pages."""
    if not start_url:
        print("Error: start_url is empty.")
        return {
            'page_text': '',
            'pages_scraped': 0,
            'total_pages_detected': 0,
            'total_unique_comparison_strings': 0,
            'error': 'Start URL was empty'
        }

    total_pages = get_total_pages(start_url)
    print(f"Total pages detected (or assumed): {total_pages}")

    seen_texts = set()  # Tracks all unique paragraph strings *used for comparison*
    chunks = []
    pages_scraped_count = 0

    # Loop through potential pages, attempting a few pages beyond the detected
    # total in case total_pages is slightly off or detection failed. The +4
    # buffer is arbitrary; the "no new content" check below is the real signal.
    for i in range(1, total_pages + 5):
        url = get_next_page_url(start_url, i)

        if debug:
            print(f"\n--- Attempting to scrape Page {i} ---")

        # Pass the normalize flag down
        new_paras = parse_page_content(url, seen_texts, debug=debug, page_number=i, normalize=normalize)

        # If we get *no* new paragraphs on any page *after* the first, assume the
        # article has ended. This is more robust than relying solely on the
        # total_pages prediction. The loop currently continues through the +4
        # buffer; add `break` here if you want to stop immediately instead.
        if not new_paras and i > 1:
            print(f"No *new* content added from page {i} and it's not the first page. Assuming end of article.")

        if new_paras:
            # Append the original-text paragraphs to the chunks
            chunks.append(f"Page {i}\n\n" + "\n\n".join(new_paras))
            pages_scraped_count += 1
            print(f"Successfully scraped page {i}. Added {len(new_paras)} new paragraphs. Total unique comparison strings seen: {len(seen_texts)}")
        elif i == 1:
            # Bail out if even the first page had no content (regardless of debug mode)
            print("Warning: Page 1 returned no paragraphs. Check URL and selectors.")
            return {
                'page_text': '',
                'pages_scraped': 0,
                'total_pages_detected': total_pages,
                'total_unique_comparison_strings': len(seen_texts),
                'error': 'No content found on the first page'
            }
        elif debug:
            print(f"Page {i} returned no paragraphs (either empty, content div not found, or all paragraphs were duplicates based on comparison text).")

    full_text = "\n\n".join(chunks).strip()

    # Final summary for debugging
    if debug:
        print("\n--- Scraping Complete ---")
        # After the loop, i holds the last page number attempted
        print(f"Total pages attempted (approx): {i if i > 1 else 1}")
        print(f"Pages successfully scraped (with new content): {pages_scraped_count}")
        print(f"Total unique comparison strings collected: {len(seen_texts)}")

    return {
        'page_text': full_text,
        'pages_scraped': pages_scraped_count,
        'total_pages_detected': total_pages,
        'total_unique_comparison_strings': len(seen_texts)
    }

# --- Example usage ---
# Assuming 'input_data' is provided from your environment.
# If running standalone, you need to define 'input_data' manually.

# Check if input_data is defined (e.g., by an external system)
if 'input_data' not in locals():
    print("Warning: 'input_data' dictionary not found. Defining a placeholder for standalone execution.")
    # Define a placeholder/default input_data for local testing.
    # *** REPLACE THIS WITH YOUR ACTUAL START URL ***
    input_data = {'initial_url': 'YOUR_URL_HERE'}

initial_url_to_scrape = input_data.get('initial_url', '').strip()  # strip() in case of stray whitespace

if not initial_url_to_scrape or initial_url_to_scrape == 'YOUR_URL_HERE':
    print("Please provide a valid URL to scrape via 'input_data'.")
    print("Example: input_data = {'initial_url': 'https://example.com/article/page/1/'}")
    # Depending on your environment, you might exit or raise an error here
    # sys.exit(1)  # Uncomment this line if you want the script to stop when no URL is given
else:
    print(f"Starting scrape for: {initial_url_to_scrape}")
    # Set debug=True to see logging
    # Set normalize=True to use text normalization for duplicate checking (recommended)
    # Set normalize=False to revert to exact string matching only
    output = parse_all_pages(initial_url_to_scrape, debug=True, normalize=True)

    # Print the results
    print("\n--- Final Output Summary ---")
    print(f"Pages scraped with new content: {output.get('pages_scraped')}")
    print(f"Total unique comparison strings collected: {output.get('total_unique_comparison_strings')}")
    print(f"Initial total pages detected: {output.get('total_pages_detected')}")
    print("\n--- Extracted Text ---")
    # Use repr() on a small slice to see hidden characters when debugging:
    # print(repr(output.get('page_text', '')[:500]))
    print(output.get('page_text'))  # Prints the full combined text