A Memory Issue in simple C Program Concerning Char Pointers -
issue
i have memory issue arises when use large text file (sourced project gutenberg: alice in wonderland), doesn't happen in smaller text files (a 2 line test text file , maya angelou poem).
in large text file receive segmentation fault , when using valgrind reports "invalid write of size 1" , "invalid read of size 1". upon inspection seems in function have written gets each word passed in line. seems complain single address location being 0 bytes after block of size 50 alloc'd.
i have looked @ code malloc 50 characters char pointer, unsure going wrong in large text file not going wrong in smaller text files. seems odd when run in debug mode wrote goes end , reaches eof verify feof(fp).
i hoping can spot going on , have missed since don't program in c often. thank in advance in understanding going on.
program overview
i pulled out parts of program writing , put simple main make things clearer , make easier spot issue. program breaks down to:
- pass in text file
- open text file pointer "r+"
- loop line line fgets
- replace '\n' or '\r' '\0' in each line
- loop through line , extract each word (tokenized isspace()) until '\0' reached
- hashes word
- free pointers used
- close file pointer
the valgrind output shows issue occurring in getword() function. have looked @ , tried outputting character character , inspecting it, not seeing why segfault happens , in large text file.
code
main.c
/* * license: gplv3 * * file: main.c * * description: program. * * author: brandon authier (hblkr) * date: 6 aug 2017 * version: 1.0 * */ // includes program run #include <stdio.h> #include <stdlib.h> #include <stdbool.h> #include <sys/types.h> #include <ctype.h> #include <string.h> #include <unistd.h> // global debug bool debug = false; /* djb2 hash function sourced online. */ unsigned long hash(unsigned char *str) { unsigned long hash = 5381; int c; while (c = *str++) hash = ((hash << 5) + hash) + c; /* hash * 33 + c */ return hash; } /* * using file pointer, first word , pass copy of word * * * @returns: boolean of true when word built * * todo: find issue in here causing memory error * */ bool getword(char* line, int* idx, char* word) { int wordidx = 0; // build word character character ( ; line[*idx] != '\0'; *idx = (*idx + 1)) { if (isalpha(line[*idx]) || (line[*idx] == '-')) { word[wordidx++] = tolower(line[*idx]); } else if (isspace(line[*idx])) { *idx += 1; return true; } } return true; } /* * process file. tokenize each line , process each word. * * todo: process file. */ void processfile(file* textfp) { // variables hold: // line text // word once parsed // index keep track of line char line[1024] = ""; unsigned char* word = malloc(sizeof(unsigned char) * 50); int* lineidx = malloc(sizeof(int)); int linecount = 1; // set line index keep track of line *lineidx = 0; while (fgets(line, sizeof(line), textfp) != null) { // line character count int charcount = 0; int wordcount = 1; for(int m = 0; line[m] != '\0'; m++) { // counting spaces, can rough estimate of how many words // in each line. (totalspaces + 1) if ((line[m] == ' ') && (line[m - 1] != ' ')) { wordcount++; } if(line[m] != '\n' && line[m] != '\r') { charcount++; } else { line[m] = '\0'; } } if (debug == true) { fprintf(stdout, "line %d:\n", linecount); fprintf(stdout, " words in line: %d\n", wordcount); fprintf(stdout, " charcount: %d\n", charcount); fprintf(stdout, " lineidx: %d\n", *lineidx); fprintf(stdout, " value: \"%s\"\n\n", line); } // word while (*lineidx < (charcount - 1)) { // sanitize word (int = 0; < 50; i++) { word[i] = '\0'; } getword(line, lineidx, word); unsigned long hash_output = hash(word); if (debug == true) { fprintf(stdout, "key: %10d,\t", hash_output); fprintf(stdout, "value: %8s,\t", word); fprintf(stdout, "lineidx: %2d\n", *lineidx); } } // end while word if (debug == true) { fprintf(stdout, "\n========\n\n"); } // reset line index 0 new line *lineidx = 0; linecount++; } // end while line if (debug == true) { if (feof(textfp)) { fprintf(stderr, "reached feof.\n"); } } // free pointers free(lineidx); free(word); } // main int main (int argc, char* argv[]) { //++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ // verify command line arguments necessary program //++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ // user did not pass in argument if (argc == 1) { fprintf(stderr, "usage: main afile.txt\n"); exit(-1); } // grab text file, possibly turn on debug, , ignore other arguments if (argc >= 3) { // debug purposes if (strcmp("-d", argv[2]) == 0) { debug = true; fprintf(stdout, "+++++++++++++++++++++++++++++++++++++++\n"); fprintf(stdout, "+ [debugging on] +\n"); fprintf(stdout, "+++++++++++++++++++++++++++++++++++++++\n\n"); } } //++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ // process passed in text file //++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ // open file reading file* fp = fopen(argv[1], "r+"); // if fp null, file not exist if (fp == 0) { fprintf(stderr, "file not exist.\n"); exit(1); } if (debug == true) { fprintf(stdout, "file exists.\n"); } if (debug == true) { fprintf(stdout, "\n"); fprintf(stdout, "================================================================================\n"); } // process file processfile(fp); // close file pointer if (fclose(fp) != 0) { fprintf(stderr, "file did not close.\n"); } if (debug == true) { fprintf(stdout, "file closed.\n"); } if (debug == true) { fprintf(stdout, "================================================================================\n"); fprintf(stdout, "\n"); } //++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ // free memory hasn't been freed yet //++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ exit(0); }
the following function seems issue arises somehow.
getword()
/* * using file pointer, first word , pass copy of word * * * @returns: boolean of true when word built * * todo: find issue in here causing memory error * */ bool getword(char* line, int* idx, char* word) { int wordidx = 0; // build word character character ( ; line[*idx] != '\0'; *idx = (*idx + 1)) { if (isalpha(line[*idx]) || (line[*idx] == '-')) { word[wordidx++] = tolower(line[*idx]); } else if (isspace(line[*idx])) { *idx += 1; return true; } } return true; }
error output
after compiling , running here output when not running in debug mode (debug verbose mode myself):
./main alice.txt
segmentation fault (core dumped)
valgrind -q --leak-check=full ./main alice.txt
==7320== invalid write of size 1 ==7320== @ 0x400a24: getword (in /tmp/main) ==7320== 0x400c7b: processfile (in /tmp/main) ==7320== 0x400f32: main (in /tmp/main) ==7320== address 0x51f62e2 0 bytes after block of size 50 alloc'd ==7320== @ 0x4c28bf6: malloc (vg_replace_malloc.c:299) ==7320== 0x400ae5: processfile (in /tmp/main) ==7320== 0x400f32: main (in /tmp/main) ==7320== ==7320== invalid read of size 1 ==7320== @ 0x400972: hash (in /tmp/main) ==7320== 0x400c87: processfile (in /tmp/main) ==7320== 0x400f32: main (in /tmp/main) ==7320== address 0x51f62e2 0 bytes after block of size 50 alloc'd ==7320== @ 0x4c28bf6: malloc (vg_replace_malloc.c:299) ==7320== 0x400ae5: processfile (in /tmp/main) ==7320== 0x400f32: main (in /tmp/main) ==7320==
text files
here 3 have tested program with:
test.txt
isn't test, it's lot of fun! how did get-here?... well, i'm not sure either.
maya.txt
pretty women wonder secret lies. i'm not cute or built suit fashion model's size when start tell them, think i'm telling lies. say, it's in reach of arms span of hips, stride of step, curl of lips. i'm woman phenomenally. phenomenal woman, that's me. walk room cool please, , man, fellows stand or fall down on knees. swarm around me, hive of honey bees. say, it's fire in eyes, , flash of teeth, swing in waist, , joy in feet. i'm woman phenomenally. phenomenal woman, that's me. men have wondered see in me. try can't touch inner mystery. when try show them still can't see. say, it's in arch of back, sun of smile, ride of breasts, grace of style. i'm woman phenomenally. phenomenal woman, that's me. understand why head's not bowed. don't shout or jump or have talk real loud. when see me passing ought make proud. say, it's in click of heels, bend of hair, palm of hand, need of care, 'cause i'm woman phenomenally. phenomenal woman, that's me.
alice.txt
here text
in comment, brandon authier claims posted code close mcve (minimal, complete, verifiable example) — has 227 lines.
i'll contend file 227 lines on twice big necessary; not mcve.
the code below saved in file so-4578-8729-mcve.c
. has 96 lines, , compiles cleanly on mac running macos sierra 10.12.6 using gcc 7.2.0 , valgrind 3.13.0.svn when compiled using command:
$ gcc -o3 -g -std=c11 -wall -wextra -werror -wmissing-prototypes \ > -wstrict-prototypes so-4578-8729-mcve.c -o so-4578-8729-mcve $
and runs cleanly under valgrind on 'alice in wonderland':
$ valgrind --suppressions=etc/suppressions-macos-10.12.5 -- \ > so-4578-8729-mcve src/data-files/alice-in-wonderland-pg19033.txt ==12363== memcheck, memory error detector ==12363== copyright (c) 2002-2017, , gnu gpl'd, julian seward et al. ==12363== using valgrind-3.13.0.svn , libvex; rerun -h copyright info ==12363== command: so-4578-8729-mcve src/data-files/alice-in-wonderland-pg19033.txt ==12363== ==12363== ==12363== heap summary: ==12363== in use @ exit: 18,188 bytes in 161 blocks ==12363== total heap usage: 180 allocs, 19 frees, 28,482 bytes allocated ==12363== ==12363== leak summary: ==12363== lost: 0 bytes in 0 blocks ==12363== indirectly lost: 0 bytes in 0 blocks ==12363== possibly lost: 0 bytes in 0 blocks ==12363== still reachable: 0 bytes in 0 blocks ==12363== suppressed: 18,188 bytes in 161 blocks ==12363== ==12363== counts of detected , suppressed errors, rerun with: -v ==12363== error summary: 0 errors 0 contexts (suppressed: 2 2) $
the fixed code includes bug fix identified bluepixy in comment. cleaner respect unsigned char
vs (plain) char
. doesn't have debug code, or comments.
#include <ctype.h> #include <stdbool.h> #include <stdio.h> #include <stdlib.h> #include <string.h> static bool getword(char *line, int *idx, char *word) { int wordidx = 0; ( ; line[*idx] != '\0'; *idx = (*idx + 1)) { if (isalpha((unsigned char)line[*idx]) || (line[*idx] == '-')) { word[wordidx++] = tolower((unsigned char)line[*idx]); } else if (isspace((unsigned char)line[*idx])) { *idx += 1; return true; } } return true; } static void processfile(file *textfp) { char line[1024] = ""; char *word = malloc(sizeof(unsigned char) * 50); int *lineidx = malloc(sizeof(int)); int linecount = 1; *lineidx = 0; while (fgets(line, sizeof(line), textfp) != null) { int charcount = 0; int wordcount = 1; (int m = 0; line[m] != '\0'; m++) { if ((line[m] == ' ') && (m == 0 || line[m - 1] != ' ')) { wordcount++; } if (line[m] != '\n' && line[m] != '\r') { charcount++; } else { line[m] = '\0'; } } while (*lineidx < (charcount - 1)) { (int = 0; < 50; i++) { word[i] = '\0'; } getword(line, lineidx, word); } *lineidx = 0; linecount++; } free(lineidx); free(word); } int main(int argc, char *argv[]) { if (argc != 2) { fprintf(stderr, "usage: %s afile.txt\n", argv[0]); exit(-1); } file *fp = fopen(argv[1], "r+"); if (fp == 0) { fprintf(stderr, "failed open file '%s' reading\n", argv[1]); exit(1); } processfile(fp); if (fclose(fp) != 0) fprintf(stderr, "failed close file '%s'.\n", argv[1]); return(0); }
this close minimal; still reduced further. -wmissing-prototypes -wstrict-prototypes
options require functions declared static
— or declared before defined. since don't need visible outside source file, made static
. 1 advantage of compiler tell me result of hash()
not used, call removed, , when removed, hash()
function unused, removed. not uses such stringent compilation options, prefer assurances give me.
Comments
Post a Comment