Coverage for pyguymer3/download_text.py: 6%

17 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-08 18:47 +0000

1#!/usr/bin/env python3 

2 

3# Define function ... 

def download_text(
    sess,
    url,
    /,
    *,
      cookies = None,
        debug = __debug__,
    ensureNFC = True,
      headers = None,
      timeout = 10.0,
       verify = True,
):
    """Fetch a URL with HTTP GET and hand back its body as text

    The page is requested through the package's ``download`` helper, HTML
    character references in the response text are unescaped and, if asked
    for, the result is converted to Unicode Normalisation Form C (NFC).

    Parameters
    ----------
    sess : requests.sessions.Session
        the :mod:`requests` session used to make the request
    url : str
        the URL to fetch
    cookies : dict, optional
        the cookie jar (an empty one is used when not supplied)
    debug : bool, optional
        print a debug message when the text has to be re-normalised
    ensureNFC : bool, optional
        make sure that the returned text is NFC normalised
    headers : dict, optional
        extra headers to send (none are added when not supplied)
    timeout : float, optional
        the timeout of the GET request
    verify : bool, optional
        verify the server's certificates

    Returns
    -------
    text : bool, str
        `False` if the download was unsuccessful, otherwise the page as a
        `str`

    Notes
    -----
    Copyright 2017 Thomas Guymer [1]_

    References
    ----------
    .. [1] PyGuymer3, https://github.com/Guymer/PyGuymer3
    """

    # Import standard modules ...
    import html
    import unicodedata

    # Import sub-functions ...
    from .download import download

    # Swap the "None" placeholders for fresh, per-call, empty containers ...
    cookies = {} if cookies is None else cookies
    headers = {} if headers is None else headers

    # **************************************************************************

    # Request the page ...
    response = download(
        sess,
        "get",
        url,
        cookies = cookies,
        headers = headers,
        timeout = timeout,
         verify = verify,
    )

    # Bail out early if the download did not work ...
    if response is False:
        return False

    # Replace HTML character references with the characters themselves ...
    content = html.unescape(response.text)

    # Nothing more to do if the caller does not want NFC or the text already
    # is NFC ...
    if not ensureNFC or unicodedata.is_normalized("NFC", content):
        return content

    # Tell the user that the text is being re-normalised ...
    if debug:
        print(f"DEBUG: Converting \"{url}\" to Unicode NFC.")

    # Return the NFC normalised answer ...
    return unicodedata.normalize("NFC", content)