python 3.x - using regex to substitute the half space in two Persian strings in a list? -
The data I am working on looks as follows: (aj_sim فتنه برانگیز) ) ) ) ) (mv (adj (aj_sim روشن) ) (v (v_pres_pos_3 می گردد) ) ) ) ) ) ) ) There are Persian words separated by a space, and when I split the contents of each file they are considered separate strings. I have to find each pair of Persian strings that come one after the other, such as "فتنه" and "برانگیز", and replace the space between them with a half-space (zero-width non-joiner, U+200C). This must be done in the same list as the tags. I mean that I have to find these kinds of strings, join them with a half-space, and save them in place of the previous strings in the list, in the same order. Here is my code:
import os
import codecs
import re

# Pattern and replacement used to turn a space into a half-space (U+200C).
# NOTE: as written this cannot do what the question wants, for two reasons:
#   1. it is applied to tokens produced by str.split(), and such tokens never
#      contain whitespace, so `\s` has nothing to match;
#   2. `\b(\w)\s` only matches when the word before the space is exactly one
#      character long.
# The logic is kept unchanged because it is the subject of the question.
original_pattern = r'\b(\w)\s(\w)'
new_pattern = '\\1\u200c\\2'

### opening the files of the folder in the directory
matches = []
for root, dirs, files in os.walk("c:\\test2"):
    for file in files:
        if file.endswith(".pts"):
            matches.append(os.path.join(root, file))
print(matches)
print(len(matches))

### reading the files
for f in matches:
    with codecs.open(f, "r", "utf-8") as fp:
        # split() breaks the bracketed tree into whitespace-separated tokens
        text = fp.read().split()
    print(type(text))

    ### substituting the space with a half-space
    # Done per file, so `text` is always defined when it is used.
    list_words = []
    for a in text:
        words = re.sub(original_pattern, new_pattern, a)
        list_words.append(words)
    print(list_words)
here sample of splitted text of each file: ['(root', '(s', '(vp', '(vps', '(np', '(np', '(n_ez', '(n_pl_com_ez_loc', 'کشورهای)', ')', '(adj', '(aj_sim', 'درحال', 'توسعه)', ')', ')', '(punc', '(punc', '،)', ')', ')', '(vp', '(np', '(n_ya', '(n_ya_sing_com', 'زمانی)', ')', '(nid', '1)', ')', '(vp', '(pp', '(prep', '(p', 'به)', ')', '(n', '(n_sing_com', 'استقلال)', ')', ')', '(mv', '(n', '(n_sing_com', 'دست)', ')', '(v', '(v_sim_pos_pa_6', 'یافتند)', ')', ')', ')', ')', ')', '(cl_1', '(conj', '(conj', 'که)', ')', '(vpsd', '(vp', '(adv', '(adv_sim_genr', 'مآلاً)', ')', '(vp', '(pp', '(pp', '(prep', '(p', 'از)', ')', '(np', '(n_ez', '(n_sing_com_ez', 'سنت)', ')', '(np', '(np_c', '(n', '(n_pl_com', 'علوم)', ')', '(n_c', '(conj', '(conj', 'و)', ')', '(n_ya', '(n_ya_sing_com', 'تکنولوژی', 'ای)', ')', ')', ')', '(cl', '(conj', '(conj', 'که)', ')', '(vpsd', '(vp', '(aux', '(v_nin_pos_aux', 'بتوان)', ')', '(vp', '(np', '(pron', '(pro_demo_sing', 'آن)', ')', '(postp', '(postp', 'را)', ')', ')', '(vp', '(adj', '(aj_sim', 'مدرن)', ')', '(mv', '(n', '(n_sing_com', 'تلقی)', ')', '(v', '(v_sim_pos_pa_3', 'نمود)', ')', ')', ')', ')', ')', ')', ')', ')', ')', ')', '(punc', '(punc', '،)', ')', ')', '(vp', '(adj', '(aj_sim', 'برخوردار)', ')', '(v', '(v_cop_neg_pa_6', 'نبودند)', ')', ')', ')', ')', ')', ')', ')', '(punc', '(punc', '.)', ')', ')', ')']
The change I expect is that strings such as 'درحال' and 'توسعه)', which come one after the other, are joined with a half-space.
I'm not experienced with `re`, so I'm not sure how `sub` works, but here's how I would put the Persian words together (with my limited `re` knowledge):
import re

# Add any further Persian letters you need to this character class.
all_persian_letters_r = r'[آابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهیئؤأإة]'
_PERSIAN_LETTER = re.compile(all_persian_letters_r)  # compiled once, reused per token


def join_persian_tokens(tokens):
    """Return a new list in which runs of adjacent Persian tokens are merged.

    A token counts as Persian when it contains at least one character from
    ``all_persian_letters_r``. Consecutive Persian tokens are joined with a
    half-space (zero-width non-joiner, U+200C); all other tokens are copied
    through unchanged and in their original order. ``tokens`` itself is not
    modified.
    """
    joined = []
    pending = None  # the run of Persian tokens merged so far, if any
    for token in tokens:
        if _PERSIAN_LETTER.search(token):
            # current token is Persian: start a run or extend the open one
            pending = token if pending is None else pending + '\u200c' + token
            continue
        # current token isn't Persian: emit the finished run, then the token
        if pending is not None:
            joined.append(pending)
            pending = None
        joined.append(token)
    if pending is not None:
        # a run that reaches the end of the list must not be lost
        joined.append(pending)
    return joined


text = [
    '(root', '(s', '(vp', '(vps', '(np', '(np', '(n_ez', '(n_pl_com_ez_loc', 'کشورهای)', ')',
    '(adj', '(aj_sim', 'درحال', 'توسعه)', ')', ')', '(punc', '(punc', '،)', ')', ')',
    '(vp', '(np', '(n_ya', '(n_ya_sing_com', 'زمانی)', ')', '(nid', '1)', ')',
    '(vp', '(pp', '(prep', '(p', 'به)', ')', '(n', '(n_sing_com', 'استقلال)', ')', ')',
    '(mv', '(n', '(n_sing_com', 'دست)', ')', '(v', '(v_sim_pos_pa_6', 'یافتند)', ')', ')', ')', ')', ')',
    '(cl_1', '(conj', '(conj', 'که)', ')', '(vpsd', '(vp', '(adv', '(adv_sim_genr', 'مآلاً)', ')',
    '(vp', '(pp', '(pp', '(prep', '(p', 'از)', ')', '(np', '(n_ez', '(n_sing_com_ez', 'سنت)', ')',
    '(np', '(np_c', '(n', '(n_pl_com', 'علوم)', ')', '(n_c', '(conj', '(conj', 'و)', ')',
    '(n_ya', '(n_ya_sing_com', 'تکنولوژی', 'ای)', ')', ')', ')',
    '(cl', '(conj', '(conj', 'که)', ')', '(vpsd', '(vp', '(aux', '(v_nin_pos_aux', 'بتوان)', ')',
    '(vp', '(np', '(pron', '(pro_demo_sing', 'آن)', ')', '(postp', '(postp', 'را)', ')', ')',
    '(vp', '(adj', '(aj_sim', 'مدرن)', ')', '(mv', '(n', '(n_sing_com', 'تلقی)', ')',
    '(v', '(v_sim_pos_pa_3', 'نمود)', ')', ')', ')', ')', ')', ')', ')', ')', ')', ')',
    '(punc', '(punc', '،)', ')', ')', '(vp', '(adj', '(aj_sim', 'برخوردار)', ')',
    '(v', '(v_cop_neg_pa_6', 'نبودند)', ')', ')', ')', ')', ')', ')', ')',
    '(punc', '(punc', '.)', ')', ')', ')',
]

words = join_persian_tokens(text)
print(words)
I'm sure you can improve it and implement it in your program somehow.
Edit to the answer above:
Is this what you needed?
If so, it's easier not to split the text in the first place:
import re

# 'ح' was missing from the letters, which is why there were problems.
# You might want to add vowel marks (and whatever else you need to find) too.
# Before the edit there was a hacky workaround here; adding more letters to
# the class is the better solution.
all_persian_letters_r = r'[آابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهیئؤأإة]'

# Whitespace that sits directly between two Persian letters. The lookarounds
# keep the letters themselves out of the match, so only the gap is replaced,
# and a Persian word followed by a tag such as "(np" is left alone.
_SPACE_BETWEEN_PERSIAN = re.compile(
    '(?<=' + all_persian_letters_r + r')\s+(?=' + all_persian_letters_r + ')'
)


def join_with_half_space(text):
    """Return ``text`` with whitespace between Persian words made a half-space.

    Every run of whitespace that has a Persian letter immediately before and
    immediately after it is replaced by one zero-width non-joiner (U+200C).
    All other characters, including the tags and parentheses, are unchanged.
    """
    return _SPACE_BETWEEN_PERSIAN.sub('\u200c', text)


# Usage: work on the whole file contents instead of splitting them first,
# e.g. where the file is open as `fp`:
#     text = join_with_half_space(fp.read())
Comments
Post a Comment