python 3.x - using regex to substitute the half space in two Persian strings in a list? -


The data I am working on is as follows: (aj_sim فتنه برانگیز) ) ) ) ) (mv (adj (aj_sim روشن) ) (v (v_pres_pos_3 می گردد) ) ) ) ) ) ) ) There are Persian words separated by a space, and when I split the contents of each file they are treated as separate strings. I have to find every two Persian strings that come one after the other, such as "فتنه" and "برانگیز", and replace the space between them with a half-space. They must stay in the same list as the tags. I mean that I have to find these kinds of strings, join them with a half-space, and save them back in the list in the same order as the previous strings. Here is my code:

import os
import codecs
import re

# Collect the path of every .pts file found under the corpus directory.
matches = []
for root, dirs, files in os.walk("c:\\test2"):
    for file in files:
        if file.endswith(".pts"):
            matches.append(os.path.join(root, file))
print(matches)
print(len(matches))

# Read each file and split its contents into whitespace-separated tokens.
for i, f in enumerate(matches):
    with codecs.open(f, "r", "utf-8") as fp:
        text = fp.read().split()
        # print(text)
        # print(len(text))
        print(type(text))

        # Attempt to substitute the space between two words with a
        # half-space (U+200C, zero-width non-joiner).
        # NOTE: this is the part that does not work. str.split() has already
        # removed all whitespace, so no single token can contain the `\s`
        # that the pattern requires and re.sub() returns every token
        # unchanged. The pattern also captures only one character on each
        # side of the space.
        original_pattern = r'\b(\w)\s(\w)'
        new_pattern = '\\1\u200c\\2'
        list_words = []
        for a in text:
            words = re.sub(original_pattern, new_pattern, a)
            list_words.append(words)
            # print(type(a))
            print(list_words)

here sample of splitted text of each file: ['(root', '(s', '(vp', '(vps', '(np', '(np', '(n_ez', '(n_pl_com_ez_loc', 'کشورهای)', ')', '(adj', '(aj_sim', 'درحال', 'توسعه)', ')', ')', '(punc', '(punc', '،)', ')', ')', '(vp', '(np', '(n_ya', '(n_ya_sing_com', 'زمانی)', ')', '(nid', '1)', ')', '(vp', '(pp', '(prep', '(p', 'به)', ')', '(n', '(n_sing_com', 'استقلال)', ')', ')', '(mv', '(n', '(n_sing_com', 'دست)', ')', '(v', '(v_sim_pos_pa_6', 'یافتند)', ')', ')', ')', ')', ')', '(cl_1', '(conj', '(conj', 'که)', ')', '(vpsd', '(vp', '(adv', '(adv_sim_genr', 'مآلاً)', ')', '(vp', '(pp', '(pp', '(prep', '(p', 'از)', ')', '(np', '(n_ez', '(n_sing_com_ez', 'سنت)', ')', '(np', '(np_c', '(n', '(n_pl_com', 'علوم)', ')', '(n_c', '(conj', '(conj', 'و)', ')', '(n_ya', '(n_ya_sing_com', 'تکنولوژی', 'ای)', ')', ')', ')', '(cl', '(conj', '(conj', 'که)', ')', '(vpsd', '(vp', '(aux', '(v_nin_pos_aux', 'بتوان)', ')', '(vp', '(np', '(pron', '(pro_demo_sing', 'آن)', ')', '(postp', '(postp', 'را)', ')', ')', '(vp', '(adj', '(aj_sim', 'مدرن)', ')', '(mv', '(n', '(n_sing_com', 'تلقی)', ')', '(v', '(v_sim_pos_pa_3', 'نمود)', ')', ')', ')', ')', ')', ')', ')', ')', ')', ')', '(punc', '(punc', '،)', ')', ')', '(vp', '(adj', '(aj_sim', 'برخوردار)', ')', '(v', '(v_cop_neg_pa_6', 'نبودند)', ')', ')', ')', ')', ')', ')', ')', '(punc', '(punc', '.)', ')', ')', ')']

The change I expect is that strings like 'درحال', 'توسعه)' that come after each other are joined with a half-space.

I'm not experienced in re, so I'm not sure how sub works, but here is how I put the Persian words together (with my limited re knowledge):

import re

# Add more Persian letters here if a word is not being recognised.
all_persian_letters_r = r'[آابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهیئؤأإة]'


def join_persian_tokens(tokens):
    """Return a copy of *tokens* with adjacent Persian tokens joined.

    A token counts as Persian when it contains at least one character from
    ``all_persian_letters_r``. Every run of consecutive Persian tokens is
    merged into a single string whose parts are separated by a half-space
    (U+200C, zero-width non-joiner). All other tokens are copied through
    unchanged and the relative order is preserved. The input list is not
    modified.
    """
    persian = re.compile(all_persian_letters_r)
    joined = []
    pending = None  # Persian run collected so far but not yet emitted
    for token in tokens:
        if persian.search(token):
            # Current token is Persian: start a new run or extend the
            # existing one.
            pending = token if pending is None else pending + '\u200c' + token
        else:
            # Current token isn't Persian: emit the finished run first.
            if pending is not None:
                joined.append(pending)
                pending = None
            joined.append(token)
    # A run that reaches the end of the list must still be emitted;
    # otherwise a trailing Persian token would be silently dropped.
    if pending is not None:
        joined.append(pending)
    return joined


text = ['(root', '(s', '(vp', '(vps', '(np', '(np', '(n_ez', '(n_pl_com_ez_loc', 'کشورهای)', ')', '(adj', '(aj_sim', 'درحال', 'توسعه)', ')', ')', '(punc', '(punc', '،)', ')', ')', '(vp', '(np', '(n_ya', '(n_ya_sing_com', 'زمانی)', ')', '(nid', '1)', ')', '(vp', '(pp', '(prep', '(p', 'به)', ')', '(n', '(n_sing_com', 'استقلال)', ')', ')', '(mv', '(n', '(n_sing_com', 'دست)', ')', '(v', '(v_sim_pos_pa_6', 'یافتند)', ')', ')', ')', ')', ')', '(cl_1', '(conj', '(conj', 'که)', ')', '(vpsd', '(vp', '(adv', '(adv_sim_genr', 'مآلاً)', ')', '(vp', '(pp', '(pp', '(prep', '(p', 'از)', ')', '(np', '(n_ez', '(n_sing_com_ez', 'سنت)', ')', '(np', '(np_c', '(n', '(n_pl_com', 'علوم)', ')', '(n_c', '(conj', '(conj', 'و)', ')', '(n_ya', '(n_ya_sing_com', 'تکنولوژی', 'ای)', ')', ')', ')', '(cl', '(conj', '(conj', 'که)', ')', '(vpsd', '(vp', '(aux', '(v_nin_pos_aux', 'بتوان)', ')', '(vp', '(np', '(pron', '(pro_demo_sing', 'آن)', ')', '(postp', '(postp', 'را)', ')', ')', '(vp', '(adj', '(aj_sim', 'مدرن)', ')', '(mv', '(n', '(n_sing_com', 'تلقی)', ')', '(v', '(v_sim_pos_pa_3', 'نمود)', ')', ')', ')', ')', ')', ')', ')', ')', ')', ')', '(punc', '(punc', '،)', ')', ')', '(vp', '(adj', '(aj_sim', 'برخوردار)', ')', '(v', '(v_cop_neg_pa_6', 'نبودند)', ')', ')', ')', ')', ')', ')', ')', '(punc', '(punc', '.)', ')', ')', ')']
words = join_persian_tokens(text)
print(words)

I'm sure you can improve it and implement it in your program somehow.

Edit to the answer above:
Is this what you needed?
If so, it's easier not to split the text in the first place:

import re

# `fp` is the file object opened in the question's script; read the whole
# file as one string instead of splitting it into tokens.
text = fp.read()
all_persian_letters_r = r'[آابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهیئؤأإة]'
# Replace the single whitespace character that directly follows a run of
# Persian letters with a half-space (U+200C). A word that ends in ")" is not
# followed directly by whitespace, so the tree structure is left untouched.
text = re.sub('(' + all_persian_letters_r + r'+)\s', '\\1\u200c', text)
# 'ح' was missing from the letters; that is why there were problems.
# You might want to add the vowel marks (and anything else you need to
# find) too.
# Before the edit this answer used a hacky way to circumvent the issue;
# adding more letters to the character class is the better solution.

Comments

Popular posts from this blog

ubuntu - PHP script to find files of certain extensions in a directory, returns populated array when run in browser, but empty array when run from terminal -

php - How can i create a user dashboard -

javascript - How to detect toggling of the fullscreen-toolbar in jQuery Mobile? -